1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1989, 1993 5 * The Regents of the University of California. All rights reserved. 6 * (c) UNIX System Laboratories, Inc. 7 * All or some portions of this file are derived from material licensed 8 * to the University of California by American Telephone and Telegraph 9 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 10 * the permission of UNIX System Laboratories, Inc. 11 * 12 * Redistribution and use in source and binary forms, with or without 13 * modification, are permitted provided that the following conditions 14 * are met: 15 * 1. Redistributions of source code must retain the above copyright 16 * notice, this list of conditions and the following disclaimer. 17 * 2. Redistributions in binary form must reproduce the above copyright 18 * notice, this list of conditions and the following disclaimer in the 19 * documentation and/or other materials provided with the distribution. 20 * 3. Neither the name of the University nor the names of its contributors 21 * may be used to endorse or promote products derived from this software 22 * without specific prior written permission. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 34 * SUCH DAMAGE. 
35 * 36 * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 37 */ 38 39 /* 40 * External virtual filesystem routines 41 */ 42 43 #include <sys/cdefs.h> 44 __FBSDID("$FreeBSD$"); 45 46 #include "opt_ddb.h" 47 #include "opt_watchdog.h" 48 49 #include <sys/param.h> 50 #include <sys/systm.h> 51 #include <sys/asan.h> 52 #include <sys/bio.h> 53 #include <sys/buf.h> 54 #include <sys/capsicum.h> 55 #include <sys/condvar.h> 56 #include <sys/conf.h> 57 #include <sys/counter.h> 58 #include <sys/dirent.h> 59 #include <sys/event.h> 60 #include <sys/eventhandler.h> 61 #include <sys/extattr.h> 62 #include <sys/file.h> 63 #include <sys/fcntl.h> 64 #include <sys/jail.h> 65 #include <sys/kdb.h> 66 #include <sys/kernel.h> 67 #include <sys/kthread.h> 68 #include <sys/ktr.h> 69 #include <sys/lockf.h> 70 #include <sys/malloc.h> 71 #include <sys/mount.h> 72 #include <sys/namei.h> 73 #include <sys/pctrie.h> 74 #include <sys/priv.h> 75 #include <sys/reboot.h> 76 #include <sys/refcount.h> 77 #include <sys/rwlock.h> 78 #include <sys/sched.h> 79 #include <sys/sleepqueue.h> 80 #include <sys/smr.h> 81 #include <sys/smp.h> 82 #include <sys/stat.h> 83 #include <sys/sysctl.h> 84 #include <sys/syslog.h> 85 #include <sys/vmmeter.h> 86 #include <sys/vnode.h> 87 #include <sys/watchdog.h> 88 89 #include <machine/stdarg.h> 90 91 #include <security/mac/mac_framework.h> 92 93 #include <vm/vm.h> 94 #include <vm/vm_object.h> 95 #include <vm/vm_extern.h> 96 #include <vm/pmap.h> 97 #include <vm/vm_map.h> 98 #include <vm/vm_page.h> 99 #include <vm/vm_kern.h> 100 #include <vm/uma.h> 101 102 #ifdef DDB 103 #include <ddb/ddb.h> 104 #endif 105 106 static void delmntque(struct vnode *vp); 107 static int flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, 108 int slpflag, int slptimeo); 109 static void syncer_shutdown(void *arg, int howto); 110 static int vtryrecycle(struct vnode *vp); 111 static void v_init_counters(struct vnode *); 112 static void vn_seqc_init(struct vnode *); 113 static void vn_seqc_write_end_free(struct vnode *vp); 114 static void vgonel(struct vnode *); 115 static bool vhold_recycle_free(struct vnode *); 116 static void vdropl_recycle(struct vnode *vp); 117 static void vdrop_recycle(struct vnode *vp); 118 static void vfs_knllock(void *arg); 119 static void vfs_knlunlock(void *arg); 120 static void vfs_knl_assert_lock(void *arg, int what); 121 static void destroy_vpollinfo(struct vpollinfo *vi); 122 static int v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo, 123 daddr_t startlbn, daddr_t endlbn); 124 static void vnlru_recalc(void); 125 126 /* 127 * Number of vnodes in existence. Increased whenever getnewvnode() 128 * allocates a new vnode, decreased in vdropl() for VIRF_DOOMED vnode. 129 */ 130 static u_long __exclusive_cache_line numvnodes; 131 132 SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, 133 "Number of vnodes in existence"); 134 135 static counter_u64_t vnodes_created; 136 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, vnodes_created, CTLFLAG_RD, &vnodes_created, 137 "Number of vnodes created by getnewvnode"); 138 139 /* 140 * Conversion tables for conversion from vnode types to inode formats 141 * and back. 142 */ 143 enum vtype iftovt_tab[16] = { 144 VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, 145 VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON 146 }; 147 int vttoif_tab[10] = { 148 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, 149 S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT 150 }; 151 152 /* 153 * List of allocates vnodes in the system. 
154 */ 155 static TAILQ_HEAD(freelst, vnode) vnode_list; 156 static struct vnode *vnode_list_free_marker; 157 static struct vnode *vnode_list_reclaim_marker; 158 159 /* 160 * "Free" vnode target. Free vnodes are rarely completely free, but are 161 * just ones that are cheap to recycle. Usually they are for files which 162 * have been stat'd but not read; these usually have inode and namecache 163 * data attached to them. This target is the preferred minimum size of a 164 * sub-cache consisting mostly of such files. The system balances the size 165 * of this sub-cache with its complement to try to prevent either from 166 * thrashing while the other is relatively inactive. The targets express 167 * a preference for the best balance. 168 * 169 * "Above" this target there are 2 further targets (watermarks) related 170 * to recyling of free vnodes. In the best-operating case, the cache is 171 * exactly full, the free list has size between vlowat and vhiwat above the 172 * free target, and recycling from it and normal use maintains this state. 173 * Sometimes the free list is below vlowat or even empty, but this state 174 * is even better for immediate use provided the cache is not full. 175 * Otherwise, vnlru_proc() runs to reclaim enough vnodes (usually non-free 176 * ones) to reach one of these states. The watermarks are currently hard- 177 * coded as 4% and 9% of the available space higher. These and the default 178 * of 25% for wantfreevnodes are too large if the memory size is large. 179 * E.g., 9% of 75% of MAXVNODES is more than 566000 vnodes to reclaim 180 * whenever vnlru_proc() becomes active. 181 */ 182 static long wantfreevnodes; 183 static long __exclusive_cache_line freevnodes; 184 SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, 185 &freevnodes, 0, "Number of \"free\" vnodes"); 186 static long freevnodes_old; 187 188 static counter_u64_t recycles_count; 189 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, recycles, CTLFLAG_RD, &recycles_count, 190 "Number of vnodes recycled to meet vnode cache targets"); 191 192 static counter_u64_t recycles_free_count; 193 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, recycles_free, CTLFLAG_RD, &recycles_free_count, 194 "Number of free vnodes recycled to meet vnode cache targets"); 195 196 static counter_u64_t deferred_inact; 197 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, deferred_inact, CTLFLAG_RD, &deferred_inact, 198 "Number of times inactive processing was deferred"); 199 200 /* To keep more than one thread at a time from running vfs_getnewfsid */ 201 static struct mtx mntid_mtx; 202 203 /* 204 * Lock for any access to the following: 205 * vnode_list 206 * numvnodes 207 * freevnodes 208 */ 209 static struct mtx __exclusive_cache_line vnode_list_mtx; 210 211 /* Publicly exported FS */ 212 struct nfs_public nfs_pub; 213 214 static uma_zone_t buf_trie_zone; 215 static smr_t buf_trie_smr; 216 217 /* Zone for allocation of new vnodes - used exclusively by getnewvnode() */ 218 static uma_zone_t vnode_zone; 219 MALLOC_DEFINE(M_VNODEPOLL, "VN POLL", "vnode poll"); 220 221 __read_frequently smr_t vfs_smr; 222 223 /* 224 * The workitem queue. 225 * 226 * It is useful to delay writes of file data and filesystem metadata 227 * for tens of seconds so that quickly created and deleted files need 228 * not waste disk bandwidth being created and removed. To realize this, 229 * we append vnodes to a "workitem" queue. When running with a soft 230 * updates implementation, most pending metadata dependencies should 231 * not wait for more than a few seconds. 
Thus, mounted on block devices 232 * are delayed only about a half the time that file data is delayed. 233 * Similarly, directory updates are more critical, so are only delayed 234 * about a third the time that file data is delayed. Thus, there are 235 * SYNCER_MAXDELAY queues that are processed round-robin at a rate of 236 * one each second (driven off the filesystem syncer process). The 237 * syncer_delayno variable indicates the next queue that is to be processed. 238 * Items that need to be processed soon are placed in this queue: 239 * 240 * syncer_workitem_pending[syncer_delayno] 241 * 242 * A delay of fifteen seconds is done by placing the request fifteen 243 * entries later in the queue: 244 * 245 * syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask] 246 * 247 */ 248 static int syncer_delayno; 249 static long syncer_mask; 250 LIST_HEAD(synclist, bufobj); 251 static struct synclist *syncer_workitem_pending; 252 /* 253 * The sync_mtx protects: 254 * bo->bo_synclist 255 * sync_vnode_count 256 * syncer_delayno 257 * syncer_state 258 * syncer_workitem_pending 259 * syncer_worklist_len 260 * rushjob 261 */ 262 static struct mtx sync_mtx; 263 static struct cv sync_wakeup; 264 265 #define SYNCER_MAXDELAY 32 266 static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */ 267 static int syncdelay = 30; /* max time to delay syncing data */ 268 static int filedelay = 30; /* time to delay syncing files */ 269 SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, 270 "Time to delay syncing files (in seconds)"); 271 static int dirdelay = 29; /* time to delay syncing directories */ 272 SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, 273 "Time to delay syncing directories (in seconds)"); 274 static int metadelay = 28; /* time to delay syncing metadata */ 275 SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, 276 "Time to delay syncing metadata (in seconds)"); 277 static int rushjob; /* number of slots to run ASAP */ 278 static int stat_rush_requests; /* number of times I/O speeded up */ 279 SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, 280 "Number of times I/O speeded up (rush requests)"); 281 282 #define VDBATCH_SIZE 8 283 struct vdbatch { 284 u_int index; 285 long freevnodes; 286 struct mtx lock; 287 struct vnode *tab[VDBATCH_SIZE]; 288 }; 289 DPCPU_DEFINE_STATIC(struct vdbatch, vd); 290 291 static void vdbatch_dequeue(struct vnode *vp); 292 293 /* 294 * When shutting down the syncer, run it at four times normal speed. 295 */ 296 #define SYNCER_SHUTDOWN_SPEEDUP 4 297 static int sync_vnode_count; 298 static int syncer_worklist_len; 299 static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY } 300 syncer_state; 301 302 /* Target for maximum number of vnodes. */ 303 u_long desiredvnodes; 304 static u_long gapvnodes; /* gap between wanted and desired */ 305 static u_long vhiwat; /* enough extras after expansion */ 306 static u_long vlowat; /* minimal extras before expansion */ 307 static u_long vstir; /* nonzero to stir non-free vnodes */ 308 static volatile int vsmalltrigger = 8; /* pref to keep if > this many pages */ 309 310 static u_long vnlru_read_freevnodes(void); 311 312 /* 313 * Note that no attempt is made to sanitize these parameters. 
314 */ 315 static int 316 sysctl_maxvnodes(SYSCTL_HANDLER_ARGS) 317 { 318 u_long val; 319 int error; 320 321 val = desiredvnodes; 322 error = sysctl_handle_long(oidp, &val, 0, req); 323 if (error != 0 || req->newptr == NULL) 324 return (error); 325 326 if (val == desiredvnodes) 327 return (0); 328 mtx_lock(&vnode_list_mtx); 329 desiredvnodes = val; 330 wantfreevnodes = desiredvnodes / 4; 331 vnlru_recalc(); 332 mtx_unlock(&vnode_list_mtx); 333 /* 334 * XXX There is no protection against multiple threads changing 335 * desiredvnodes at the same time. Locking above only helps vnlru and 336 * getnewvnode. 337 */ 338 vfs_hash_changesize(desiredvnodes); 339 cache_changesize(desiredvnodes); 340 return (0); 341 } 342 343 SYSCTL_PROC(_kern, KERN_MAXVNODES, maxvnodes, 344 CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_maxvnodes, 345 "LU", "Target for maximum number of vnodes"); 346 347 static int 348 sysctl_wantfreevnodes(SYSCTL_HANDLER_ARGS) 349 { 350 u_long val; 351 int error; 352 353 val = wantfreevnodes; 354 error = sysctl_handle_long(oidp, &val, 0, req); 355 if (error != 0 || req->newptr == NULL) 356 return (error); 357 358 if (val == wantfreevnodes) 359 return (0); 360 mtx_lock(&vnode_list_mtx); 361 wantfreevnodes = val; 362 vnlru_recalc(); 363 mtx_unlock(&vnode_list_mtx); 364 return (0); 365 } 366 367 SYSCTL_PROC(_vfs, OID_AUTO, wantfreevnodes, 368 CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_wantfreevnodes, 369 "LU", "Target for minimum number of \"free\" vnodes"); 370 371 SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW, 372 &wantfreevnodes, 0, "Old name for vfs.wantfreevnodes (legacy)"); 373 static int vnlru_nowhere; 374 SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW | CTLFLAG_STATS, 375 &vnlru_nowhere, 0, "Number of times the vnlru process ran without success"); 376 377 static int 378 sysctl_try_reclaim_vnode(SYSCTL_HANDLER_ARGS) 379 { 380 struct vnode *vp; 381 struct nameidata nd; 382 char *buf; 383 unsigned long ndflags; 384 int error; 385 386 if (req->newptr == NULL) 387 return (EINVAL); 388 if (req->newlen >= PATH_MAX) 389 return (E2BIG); 390 391 buf = malloc(PATH_MAX, M_TEMP, M_WAITOK); 392 error = SYSCTL_IN(req, buf, req->newlen); 393 if (error != 0) 394 goto out; 395 396 buf[req->newlen] = '\0'; 397 398 ndflags = LOCKLEAF | NOFOLLOW | AUDITVNODE1; 399 NDINIT(&nd, LOOKUP, ndflags, UIO_SYSSPACE, buf); 400 if ((error = namei(&nd)) != 0) 401 goto out; 402 vp = nd.ni_vp; 403 404 if (VN_IS_DOOMED(vp)) { 405 /* 406 * This vnode is being recycled. Return != 0 to let the caller 407 * know that the sysctl had no effect. 
Return EAGAIN because a 408 * subsequent call will likely succeed (since namei will create 409 * a new vnode if necessary) 410 */ 411 error = EAGAIN; 412 goto putvnode; 413 } 414 415 counter_u64_add(recycles_count, 1); 416 vgone(vp); 417 putvnode: 418 vput(vp); 419 NDFREE_PNBUF(&nd); 420 out: 421 free(buf, M_TEMP); 422 return (error); 423 } 424 425 static int 426 sysctl_ftry_reclaim_vnode(SYSCTL_HANDLER_ARGS) 427 { 428 struct thread *td = curthread; 429 struct vnode *vp; 430 struct file *fp; 431 int error; 432 int fd; 433 434 if (req->newptr == NULL) 435 return (EBADF); 436 437 error = sysctl_handle_int(oidp, &fd, 0, req); 438 if (error != 0) 439 return (error); 440 error = getvnode(curthread, fd, &cap_fcntl_rights, &fp); 441 if (error != 0) 442 return (error); 443 vp = fp->f_vnode; 444 445 error = vn_lock(vp, LK_EXCLUSIVE); 446 if (error != 0) 447 goto drop; 448 449 counter_u64_add(recycles_count, 1); 450 vgone(vp); 451 VOP_UNLOCK(vp); 452 drop: 453 fdrop(fp, td); 454 return (error); 455 } 456 457 SYSCTL_PROC(_debug, OID_AUTO, try_reclaim_vnode, 458 CTLTYPE_STRING | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0, 459 sysctl_try_reclaim_vnode, "A", "Try to reclaim a vnode by its pathname"); 460 SYSCTL_PROC(_debug, OID_AUTO, ftry_reclaim_vnode, 461 CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0, 462 sysctl_ftry_reclaim_vnode, "I", 463 "Try to reclaim a vnode by its file descriptor"); 464 465 /* Shift count for (uintptr_t)vp to initialize vp->v_hash. */ 466 #define vnsz2log 8 467 #ifndef DEBUG_LOCKS 468 _Static_assert(sizeof(struct vnode) >= 1UL << vnsz2log && 469 sizeof(struct vnode) < 1UL << (vnsz2log + 1), 470 "vnsz2log needs to be updated"); 471 #endif 472 473 /* 474 * Support for the bufobj clean & dirty pctrie. 475 */ 476 static void * 477 buf_trie_alloc(struct pctrie *ptree) 478 { 479 return (uma_zalloc_smr(buf_trie_zone, M_NOWAIT)); 480 } 481 482 static void 483 buf_trie_free(struct pctrie *ptree, void *node) 484 { 485 uma_zfree_smr(buf_trie_zone, node); 486 } 487 PCTRIE_DEFINE_SMR(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free, 488 buf_trie_smr); 489 490 /* 491 * Initialize the vnode management data structures. 492 * 493 * Reevaluate the following cap on the number of vnodes after the physical 494 * memory size exceeds 512GB. In the limit, as the physical memory size 495 * grows, the ratio of the memory size in KB to vnodes approaches 64:1. 
496 */ 497 #ifndef MAXVNODES_MAX 498 #define MAXVNODES_MAX (512UL * 1024 * 1024 / 64) /* 8M */ 499 #endif 500 501 static MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker"); 502 503 static struct vnode * 504 vn_alloc_marker(struct mount *mp) 505 { 506 struct vnode *vp; 507 508 vp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO); 509 vp->v_type = VMARKER; 510 vp->v_mount = mp; 511 512 return (vp); 513 } 514 515 static void 516 vn_free_marker(struct vnode *vp) 517 { 518 519 MPASS(vp->v_type == VMARKER); 520 free(vp, M_VNODE_MARKER); 521 } 522 523 #ifdef KASAN 524 static int 525 vnode_ctor(void *mem, int size, void *arg __unused, int flags __unused) 526 { 527 kasan_mark(mem, size, roundup2(size, UMA_ALIGN_PTR + 1), 0); 528 return (0); 529 } 530 531 static void 532 vnode_dtor(void *mem, int size, void *arg __unused) 533 { 534 size_t end1, end2, off1, off2; 535 536 _Static_assert(offsetof(struct vnode, v_vnodelist) < 537 offsetof(struct vnode, v_dbatchcpu), 538 "KASAN marks require updating"); 539 540 off1 = offsetof(struct vnode, v_vnodelist); 541 off2 = offsetof(struct vnode, v_dbatchcpu); 542 end1 = off1 + sizeof(((struct vnode *)NULL)->v_vnodelist); 543 end2 = off2 + sizeof(((struct vnode *)NULL)->v_dbatchcpu); 544 545 /* 546 * Access to the v_vnodelist and v_dbatchcpu fields are permitted even 547 * after the vnode has been freed. Try to get some KASAN coverage by 548 * marking everything except those two fields as invalid. Because 549 * KASAN's tracking is not byte-granular, any preceding fields sharing 550 * the same 8-byte aligned word must also be marked valid. 551 */ 552 553 /* Handle the area from the start until v_vnodelist... */ 554 off1 = rounddown2(off1, KASAN_SHADOW_SCALE); 555 kasan_mark(mem, off1, off1, KASAN_UMA_FREED); 556 557 /* ... then the area between v_vnodelist and v_dbatchcpu ... */ 558 off1 = roundup2(end1, KASAN_SHADOW_SCALE); 559 off2 = rounddown2(off2, KASAN_SHADOW_SCALE); 560 if (off2 > off1) 561 kasan_mark((void *)((char *)mem + off1), off2 - off1, 562 off2 - off1, KASAN_UMA_FREED); 563 564 /* ... and finally the area from v_dbatchcpu to the end. */ 565 off2 = roundup2(end2, KASAN_SHADOW_SCALE); 566 kasan_mark((void *)((char *)mem + off2), size - off2, size - off2, 567 KASAN_UMA_FREED); 568 } 569 #endif /* KASAN */ 570 571 /* 572 * Initialize a vnode as it first enters the zone. 573 */ 574 static int 575 vnode_init(void *mem, int size, int flags) 576 { 577 struct vnode *vp; 578 579 vp = mem; 580 bzero(vp, size); 581 /* 582 * Setup locks. 583 */ 584 vp->v_vnlock = &vp->v_lock; 585 mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF); 586 /* 587 * By default, don't allow shared locks unless filesystems opt-in. 588 */ 589 lockinit(vp->v_vnlock, PVFS, "vnode", VLKTIMEOUT, 590 LK_NOSHARE | LK_IS_VNODE); 591 /* 592 * Initialize bufobj. 593 */ 594 bufobj_init(&vp->v_bufobj, vp); 595 /* 596 * Initialize namecache. 597 */ 598 cache_vnode_init(vp); 599 /* 600 * Initialize rangelocks. 601 */ 602 rangelock_init(&vp->v_rl); 603 604 vp->v_dbatchcpu = NOCPU; 605 606 vp->v_state = VSTATE_DEAD; 607 608 /* 609 * Check vhold_recycle_free for an explanation. 610 */ 611 vp->v_holdcnt = VHOLD_NO_SMR; 612 vp->v_type = VNON; 613 mtx_lock(&vnode_list_mtx); 614 TAILQ_INSERT_BEFORE(vnode_list_free_marker, vp, v_vnodelist); 615 mtx_unlock(&vnode_list_mtx); 616 return (0); 617 } 618 619 /* 620 * Free a vnode when it is cleared from the zone. 
621 */ 622 static void 623 vnode_fini(void *mem, int size) 624 { 625 struct vnode *vp; 626 struct bufobj *bo; 627 628 vp = mem; 629 vdbatch_dequeue(vp); 630 mtx_lock(&vnode_list_mtx); 631 TAILQ_REMOVE(&vnode_list, vp, v_vnodelist); 632 mtx_unlock(&vnode_list_mtx); 633 rangelock_destroy(&vp->v_rl); 634 lockdestroy(vp->v_vnlock); 635 mtx_destroy(&vp->v_interlock); 636 bo = &vp->v_bufobj; 637 rw_destroy(BO_LOCKPTR(bo)); 638 639 kasan_mark(mem, size, size, 0); 640 } 641 642 /* 643 * Provide the size of NFS nclnode and NFS fh for calculation of the 644 * vnode memory consumption. The size is specified directly to 645 * eliminate dependency on NFS-private header. 646 * 647 * Other filesystems may use bigger or smaller (like UFS and ZFS) 648 * private inode data, but the NFS-based estimation is ample enough. 649 * Still, we care about differences in the size between 64- and 32-bit 650 * platforms. 651 * 652 * Namecache structure size is heuristically 653 * sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1. 654 */ 655 #ifdef _LP64 656 #define NFS_NCLNODE_SZ (528 + 64) 657 #define NC_SZ 148 658 #else 659 #define NFS_NCLNODE_SZ (360 + 32) 660 #define NC_SZ 92 661 #endif 662 663 static void 664 vntblinit(void *dummy __unused) 665 { 666 struct vdbatch *vd; 667 uma_ctor ctor; 668 uma_dtor dtor; 669 int cpu, physvnodes, virtvnodes; 670 671 /* 672 * Desiredvnodes is a function of the physical memory size and the 673 * kernel's heap size. Generally speaking, it scales with the 674 * physical memory size. The ratio of desiredvnodes to the physical 675 * memory size is 1:16 until desiredvnodes exceeds 98,304. 676 * Thereafter, the 677 * marginal ratio of desiredvnodes to the physical memory size is 678 * 1:64. However, desiredvnodes is limited by the kernel's heap 679 * size. The memory required by desiredvnodes vnodes and vm objects 680 * must not exceed 1/10th of the kernel's heap size. 681 */ 682 physvnodes = maxproc + pgtok(vm_cnt.v_page_count) / 64 + 683 3 * min(98304 * 16, pgtok(vm_cnt.v_page_count)) / 64; 684 virtvnodes = vm_kmem_size / (10 * (sizeof(struct vm_object) + 685 sizeof(struct vnode) + NC_SZ * ncsizefactor + NFS_NCLNODE_SZ)); 686 desiredvnodes = min(physvnodes, virtvnodes); 687 if (desiredvnodes > MAXVNODES_MAX) { 688 if (bootverbose) 689 printf("Reducing kern.maxvnodes %lu -> %lu\n", 690 desiredvnodes, MAXVNODES_MAX); 691 desiredvnodes = MAXVNODES_MAX; 692 } 693 wantfreevnodes = desiredvnodes / 4; 694 mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF); 695 TAILQ_INIT(&vnode_list); 696 mtx_init(&vnode_list_mtx, "vnode_list", NULL, MTX_DEF); 697 /* 698 * The lock is taken to appease WITNESS. 699 */ 700 mtx_lock(&vnode_list_mtx); 701 vnlru_recalc(); 702 mtx_unlock(&vnode_list_mtx); 703 vnode_list_free_marker = vn_alloc_marker(NULL); 704 TAILQ_INSERT_HEAD(&vnode_list, vnode_list_free_marker, v_vnodelist); 705 vnode_list_reclaim_marker = vn_alloc_marker(NULL); 706 TAILQ_INSERT_HEAD(&vnode_list, vnode_list_reclaim_marker, v_vnodelist); 707 708 #ifdef KASAN 709 ctor = vnode_ctor; 710 dtor = vnode_dtor; 711 #else 712 ctor = NULL; 713 dtor = NULL; 714 #endif 715 vnode_zone = uma_zcreate("VNODE", sizeof(struct vnode), ctor, dtor, 716 vnode_init, vnode_fini, UMA_ALIGN_PTR, UMA_ZONE_NOKASAN); 717 uma_zone_set_smr(vnode_zone, vfs_smr); 718 719 /* 720 * Preallocate enough nodes to support one-per buf so that 721 * we can not fail an insert. reassignbuf() callers can not 722 * tolerate the insertion failure. 
723 */ 724 buf_trie_zone = uma_zcreate("BUF TRIE", pctrie_node_size(), 725 NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR, 726 UMA_ZONE_NOFREE | UMA_ZONE_SMR); 727 buf_trie_smr = uma_zone_get_smr(buf_trie_zone); 728 uma_prealloc(buf_trie_zone, nbuf); 729 730 vnodes_created = counter_u64_alloc(M_WAITOK); 731 recycles_count = counter_u64_alloc(M_WAITOK); 732 recycles_free_count = counter_u64_alloc(M_WAITOK); 733 deferred_inact = counter_u64_alloc(M_WAITOK); 734 735 /* 736 * Initialize the filesystem syncer. 737 */ 738 syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE, 739 &syncer_mask); 740 syncer_maxdelay = syncer_mask + 1; 741 mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF); 742 cv_init(&sync_wakeup, "syncer"); 743 744 CPU_FOREACH(cpu) { 745 vd = DPCPU_ID_PTR((cpu), vd); 746 bzero(vd, sizeof(*vd)); 747 mtx_init(&vd->lock, "vdbatch", NULL, MTX_DEF); 748 } 749 } 750 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL); 751 752 /* 753 * Mark a mount point as busy. Used to synchronize access and to delay 754 * unmounting. Eventually, mountlist_mtx is not released on failure. 755 * 756 * vfs_busy() is a custom lock, it can block the caller. 757 * vfs_busy() only sleeps if the unmount is active on the mount point. 758 * For a mountpoint mp, vfs_busy-enforced lock is before lock of any 759 * vnode belonging to mp. 760 * 761 * Lookup uses vfs_busy() to traverse mount points. 762 * root fs var fs 763 * / vnode lock A / vnode lock (/var) D 764 * /var vnode lock B /log vnode lock(/var/log) E 765 * vfs_busy lock C vfs_busy lock F 766 * 767 * Within each file system, the lock order is C->A->B and F->D->E. 768 * 769 * When traversing across mounts, the system follows that lock order: 770 * 771 * C->A->B 772 * | 773 * +->F->D->E 774 * 775 * The lookup() process for namei("/var") illustrates the process: 776 * 1. VOP_LOOKUP() obtains B while A is held 777 * 2. vfs_busy() obtains a shared lock on F while A and B are held 778 * 3. vput() releases lock on B 779 * 4. vput() releases lock on A 780 * 5. VFS_ROOT() obtains lock on D while shared lock on F is held 781 * 6. vfs_unbusy() releases shared lock on F 782 * 7. vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A. 783 * Attempt to lock A (instead of vp_crossmp) while D is held would 784 * violate the global order, causing deadlocks. 785 * 786 * dounmount() locks B while F is drained. Note that for stacked 787 * filesystems, D and B in the example above may be the same lock, 788 * which introdues potential lock order reversal deadlock between 789 * dounmount() and step 5 above. These filesystems may avoid the LOR 790 * by setting VV_CROSSLOCK on the covered vnode so that lock B will 791 * remain held until after step 5. 792 */ 793 int 794 vfs_busy(struct mount *mp, int flags) 795 { 796 struct mount_pcpu *mpcpu; 797 798 MPASS((flags & ~MBF_MASK) == 0); 799 CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags); 800 801 if (vfs_op_thread_enter(mp, mpcpu)) { 802 MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0); 803 MPASS((mp->mnt_kern_flag & MNTK_UNMOUNT) == 0); 804 MPASS((mp->mnt_kern_flag & MNTK_REFEXPIRE) == 0); 805 vfs_mp_count_add_pcpu(mpcpu, ref, 1); 806 vfs_mp_count_add_pcpu(mpcpu, lockref, 1); 807 vfs_op_thread_exit(mp, mpcpu); 808 if (flags & MBF_MNTLSTLOCK) 809 mtx_unlock(&mountlist_mtx); 810 return (0); 811 } 812 813 MNT_ILOCK(mp); 814 vfs_assert_mount_counters(mp); 815 MNT_REF(mp); 816 /* 817 * If mount point is currently being unmounted, sleep until the 818 * mount point fate is decided. 
If thread doing the unmounting fails, 819 * it will clear MNTK_UNMOUNT flag before waking us up, indicating 820 * that this mount point has survived the unmount attempt and vfs_busy 821 * should retry. Otherwise the unmounter thread will set MNTK_REFEXPIRE 822 * flag in addition to MNTK_UNMOUNT, indicating that mount point is 823 * about to be really destroyed. vfs_busy needs to release its 824 * reference on the mount point in this case and return with ENOENT, 825 * telling the caller the mount it tried to busy is no longer valid. 826 */ 827 while (mp->mnt_kern_flag & MNTK_UNMOUNT) { 828 KASSERT(TAILQ_EMPTY(&mp->mnt_uppers), 829 ("%s: non-empty upper mount list with pending unmount", 830 __func__)); 831 if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) { 832 MNT_REL(mp); 833 MNT_IUNLOCK(mp); 834 CTR1(KTR_VFS, "%s: failed busying before sleeping", 835 __func__); 836 return (ENOENT); 837 } 838 if (flags & MBF_MNTLSTLOCK) 839 mtx_unlock(&mountlist_mtx); 840 mp->mnt_kern_flag |= MNTK_MWAIT; 841 msleep(mp, MNT_MTX(mp), PVFS | PDROP, "vfs_busy", 0); 842 if (flags & MBF_MNTLSTLOCK) 843 mtx_lock(&mountlist_mtx); 844 MNT_ILOCK(mp); 845 } 846 if (flags & MBF_MNTLSTLOCK) 847 mtx_unlock(&mountlist_mtx); 848 mp->mnt_lockref++; 849 MNT_IUNLOCK(mp); 850 return (0); 851 } 852 853 /* 854 * Free a busy filesystem. 855 */ 856 void 857 vfs_unbusy(struct mount *mp) 858 { 859 struct mount_pcpu *mpcpu; 860 int c; 861 862 CTR2(KTR_VFS, "%s: mp %p", __func__, mp); 863 864 if (vfs_op_thread_enter(mp, mpcpu)) { 865 MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0); 866 vfs_mp_count_sub_pcpu(mpcpu, lockref, 1); 867 vfs_mp_count_sub_pcpu(mpcpu, ref, 1); 868 vfs_op_thread_exit(mp, mpcpu); 869 return; 870 } 871 872 MNT_ILOCK(mp); 873 vfs_assert_mount_counters(mp); 874 MNT_REL(mp); 875 c = --mp->mnt_lockref; 876 if (mp->mnt_vfs_ops == 0) { 877 MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0); 878 MNT_IUNLOCK(mp); 879 return; 880 } 881 if (c < 0) 882 vfs_dump_mount_counters(mp); 883 if (c == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) { 884 MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT); 885 CTR1(KTR_VFS, "%s: waking up waiters", __func__); 886 mp->mnt_kern_flag &= ~MNTK_DRAINING; 887 wakeup(&mp->mnt_lockref); 888 } 889 MNT_IUNLOCK(mp); 890 } 891 892 /* 893 * Lookup a mount point by filesystem identifier. 894 */ 895 struct mount * 896 vfs_getvfs(fsid_t *fsid) 897 { 898 struct mount *mp; 899 900 CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid); 901 mtx_lock(&mountlist_mtx); 902 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 903 if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) { 904 vfs_ref(mp); 905 mtx_unlock(&mountlist_mtx); 906 return (mp); 907 } 908 } 909 mtx_unlock(&mountlist_mtx); 910 CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid); 911 return ((struct mount *) 0); 912 } 913 914 /* 915 * Lookup a mount point by filesystem identifier, busying it before 916 * returning. 917 * 918 * To avoid congestion on mountlist_mtx, implement simple direct-mapped 919 * cache for popular filesystem identifiers. The cache is lockess, using 920 * the fact that struct mount's are never freed. In worst case we may 921 * get pointer to unmounted or even different filesystem, so we have to 922 * check what we got, and go slow way if so. 
923 */ 924 struct mount * 925 vfs_busyfs(fsid_t *fsid) 926 { 927 #define FSID_CACHE_SIZE 256 928 typedef struct mount * volatile vmp_t; 929 static vmp_t cache[FSID_CACHE_SIZE]; 930 struct mount *mp; 931 int error; 932 uint32_t hash; 933 934 CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid); 935 hash = fsid->val[0] ^ fsid->val[1]; 936 hash = (hash >> 16 ^ hash) & (FSID_CACHE_SIZE - 1); 937 mp = cache[hash]; 938 if (mp == NULL || fsidcmp(&mp->mnt_stat.f_fsid, fsid) != 0) 939 goto slow; 940 if (vfs_busy(mp, 0) != 0) { 941 cache[hash] = NULL; 942 goto slow; 943 } 944 if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) 945 return (mp); 946 else 947 vfs_unbusy(mp); 948 949 slow: 950 mtx_lock(&mountlist_mtx); 951 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 952 if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) { 953 error = vfs_busy(mp, MBF_MNTLSTLOCK); 954 if (error) { 955 cache[hash] = NULL; 956 mtx_unlock(&mountlist_mtx); 957 return (NULL); 958 } 959 cache[hash] = mp; 960 return (mp); 961 } 962 } 963 CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid); 964 mtx_unlock(&mountlist_mtx); 965 return ((struct mount *) 0); 966 } 967 968 /* 969 * Check if a user can access privileged mount options. 970 */ 971 int 972 vfs_suser(struct mount *mp, struct thread *td) 973 { 974 int error; 975 976 if (jailed(td->td_ucred)) { 977 /* 978 * If the jail of the calling thread lacks permission for 979 * this type of file system, deny immediately. 980 */ 981 if (!prison_allow(td->td_ucred, mp->mnt_vfc->vfc_prison_flag)) 982 return (EPERM); 983 984 /* 985 * If the file system was mounted outside the jail of the 986 * calling thread, deny immediately. 987 */ 988 if (prison_check(td->td_ucred, mp->mnt_cred) != 0) 989 return (EPERM); 990 } 991 992 /* 993 * If file system supports delegated administration, we don't check 994 * for the PRIV_VFS_MOUNT_OWNER privilege - it will be better verified 995 * by the file system itself. 996 * If this is not the user that did original mount, we check for 997 * the PRIV_VFS_MOUNT_OWNER privilege. 998 */ 999 if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) && 1000 mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) { 1001 if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0) 1002 return (error); 1003 } 1004 return (0); 1005 } 1006 1007 /* 1008 * Get a new unique fsid. Try to make its val[0] unique, since this value 1009 * will be used to create fake device numbers for stat(). Also try (but 1010 * not so hard) make its val[0] unique mod 2^16, since some emulators only 1011 * support 16-bit device numbers. We end up with unique val[0]'s for the 1012 * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls. 1013 * 1014 * Keep in mind that several mounts may be running in parallel. Starting 1015 * the search one past where the previous search terminated is both a 1016 * micro-optimization and a defense against returning the same fsid to 1017 * different mounts. 
1018 */ 1019 void 1020 vfs_getnewfsid(struct mount *mp) 1021 { 1022 static uint16_t mntid_base; 1023 struct mount *nmp; 1024 fsid_t tfsid; 1025 int mtype; 1026 1027 CTR2(KTR_VFS, "%s: mp %p", __func__, mp); 1028 mtx_lock(&mntid_mtx); 1029 mtype = mp->mnt_vfc->vfc_typenum; 1030 tfsid.val[1] = mtype; 1031 mtype = (mtype & 0xFF) << 24; 1032 for (;;) { 1033 tfsid.val[0] = makedev(255, 1034 mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF)); 1035 mntid_base++; 1036 if ((nmp = vfs_getvfs(&tfsid)) == NULL) 1037 break; 1038 vfs_rel(nmp); 1039 } 1040 mp->mnt_stat.f_fsid.val[0] = tfsid.val[0]; 1041 mp->mnt_stat.f_fsid.val[1] = tfsid.val[1]; 1042 mtx_unlock(&mntid_mtx); 1043 } 1044 1045 /* 1046 * Knob to control the precision of file timestamps: 1047 * 1048 * 0 = seconds only; nanoseconds zeroed. 1049 * 1 = seconds and nanoseconds, accurate within 1/HZ. 1050 * 2 = seconds and nanoseconds, truncated to microseconds. 1051 * >=3 = seconds and nanoseconds, maximum precision. 1052 */ 1053 enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC }; 1054 1055 static int timestamp_precision = TSP_USEC; 1056 SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW, 1057 ×tamp_precision, 0, "File timestamp precision (0: seconds, " 1058 "1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to us, " 1059 "3+: sec + ns (max. precision))"); 1060 1061 /* 1062 * Get a current timestamp. 1063 */ 1064 void 1065 vfs_timestamp(struct timespec *tsp) 1066 { 1067 struct timeval tv; 1068 1069 switch (timestamp_precision) { 1070 case TSP_SEC: 1071 tsp->tv_sec = time_second; 1072 tsp->tv_nsec = 0; 1073 break; 1074 case TSP_HZ: 1075 getnanotime(tsp); 1076 break; 1077 case TSP_USEC: 1078 microtime(&tv); 1079 TIMEVAL_TO_TIMESPEC(&tv, tsp); 1080 break; 1081 case TSP_NSEC: 1082 default: 1083 nanotime(tsp); 1084 break; 1085 } 1086 } 1087 1088 /* 1089 * Set vnode attributes to VNOVAL 1090 */ 1091 void 1092 vattr_null(struct vattr *vap) 1093 { 1094 1095 vap->va_type = VNON; 1096 vap->va_size = VNOVAL; 1097 vap->va_bytes = VNOVAL; 1098 vap->va_mode = VNOVAL; 1099 vap->va_nlink = VNOVAL; 1100 vap->va_uid = VNOVAL; 1101 vap->va_gid = VNOVAL; 1102 vap->va_fsid = VNOVAL; 1103 vap->va_fileid = VNOVAL; 1104 vap->va_blocksize = VNOVAL; 1105 vap->va_rdev = VNOVAL; 1106 vap->va_atime.tv_sec = VNOVAL; 1107 vap->va_atime.tv_nsec = VNOVAL; 1108 vap->va_mtime.tv_sec = VNOVAL; 1109 vap->va_mtime.tv_nsec = VNOVAL; 1110 vap->va_ctime.tv_sec = VNOVAL; 1111 vap->va_ctime.tv_nsec = VNOVAL; 1112 vap->va_birthtime.tv_sec = VNOVAL; 1113 vap->va_birthtime.tv_nsec = VNOVAL; 1114 vap->va_flags = VNOVAL; 1115 vap->va_gen = VNOVAL; 1116 vap->va_vaflags = 0; 1117 } 1118 1119 /* 1120 * Try to reduce the total number of vnodes. 1121 * 1122 * This routine (and its user) are buggy in at least the following ways: 1123 * - all parameters were picked years ago when RAM sizes were significantly 1124 * smaller 1125 * - it can pick vnodes based on pages used by the vm object, but filesystems 1126 * like ZFS don't use it making the pick broken 1127 * - since ZFS has its own aging policy it gets partially combated by this one 1128 * - a dedicated method should be provided for filesystems to let them decide 1129 * whether the vnode should be recycled 1130 * 1131 * This routine is called when we have too many vnodes. It attempts 1132 * to free <count> vnodes and will potentially free vnodes that still 1133 * have VM backing store (VM backing store is typically the cause 1134 * of a vnode blowout so we want to do this). Therefore, this operation 1135 * is not considered cheap. 
1136 * 1137 * A number of conditions may prevent a vnode from being reclaimed. 1138 * the buffer cache may have references on the vnode, a directory 1139 * vnode may still have references due to the namei cache representing 1140 * underlying files, or the vnode may be in active use. It is not 1141 * desirable to reuse such vnodes. These conditions may cause the 1142 * number of vnodes to reach some minimum value regardless of what 1143 * you set kern.maxvnodes to. Do not set kern.maxvnodes too low. 1144 * 1145 * @param reclaim_nc_src Only reclaim directories with outgoing namecache 1146 * entries if this argument is strue 1147 * @param trigger Only reclaim vnodes with fewer than this many resident 1148 * pages. 1149 * @param target How many vnodes to reclaim. 1150 * @return The number of vnodes that were reclaimed. 1151 */ 1152 static int 1153 vlrureclaim(bool reclaim_nc_src, int trigger, u_long target) 1154 { 1155 struct vnode *vp, *mvp; 1156 struct mount *mp; 1157 struct vm_object *object; 1158 u_long done; 1159 bool retried; 1160 1161 mtx_assert(&vnode_list_mtx, MA_OWNED); 1162 1163 retried = false; 1164 done = 0; 1165 1166 mvp = vnode_list_reclaim_marker; 1167 restart: 1168 vp = mvp; 1169 while (done < target) { 1170 vp = TAILQ_NEXT(vp, v_vnodelist); 1171 if (__predict_false(vp == NULL)) 1172 break; 1173 1174 if (__predict_false(vp->v_type == VMARKER)) 1175 continue; 1176 1177 /* 1178 * If it's been deconstructed already, it's still 1179 * referenced, or it exceeds the trigger, skip it. 1180 * Also skip free vnodes. We are trying to make space 1181 * to expand the free list, not reduce it. 1182 */ 1183 if (vp->v_usecount > 0 || vp->v_holdcnt == 0 || 1184 (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src))) 1185 goto next_iter; 1186 1187 if (vp->v_type == VBAD || vp->v_type == VNON) 1188 goto next_iter; 1189 1190 object = atomic_load_ptr(&vp->v_object); 1191 if (object == NULL || object->resident_page_count > trigger) { 1192 goto next_iter; 1193 } 1194 1195 /* 1196 * Handle races against vnode allocation. Filesystems lock the 1197 * vnode some time after it gets returned from getnewvnode, 1198 * despite type and hold count being manipulated earlier. 1199 * Resorting to checking v_mount restores guarantees present 1200 * before the global list was reworked to contain all vnodes. 
1201 */ 1202 if (!VI_TRYLOCK(vp)) 1203 goto next_iter; 1204 if (__predict_false(vp->v_type == VBAD || vp->v_type == VNON)) { 1205 VI_UNLOCK(vp); 1206 goto next_iter; 1207 } 1208 if (vp->v_mount == NULL) { 1209 VI_UNLOCK(vp); 1210 goto next_iter; 1211 } 1212 vholdl(vp); 1213 VI_UNLOCK(vp); 1214 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); 1215 TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist); 1216 mtx_unlock(&vnode_list_mtx); 1217 1218 if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { 1219 vdrop_recycle(vp); 1220 goto next_iter_unlocked; 1221 } 1222 if (VOP_LOCK(vp, LK_EXCLUSIVE|LK_NOWAIT) != 0) { 1223 vdrop_recycle(vp); 1224 vn_finished_write(mp); 1225 goto next_iter_unlocked; 1226 } 1227 1228 VI_LOCK(vp); 1229 if (vp->v_usecount > 0 || 1230 (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) || 1231 (vp->v_object != NULL && vp->v_object->handle == vp && 1232 vp->v_object->resident_page_count > trigger)) { 1233 VOP_UNLOCK(vp); 1234 vdropl_recycle(vp); 1235 vn_finished_write(mp); 1236 goto next_iter_unlocked; 1237 } 1238 counter_u64_add(recycles_count, 1); 1239 vgonel(vp); 1240 VOP_UNLOCK(vp); 1241 vdropl_recycle(vp); 1242 vn_finished_write(mp); 1243 done++; 1244 next_iter_unlocked: 1245 if (should_yield()) 1246 kern_yield(PRI_USER); 1247 mtx_lock(&vnode_list_mtx); 1248 goto restart; 1249 next_iter: 1250 MPASS(vp->v_type != VMARKER); 1251 if (!should_yield()) 1252 continue; 1253 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); 1254 TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist); 1255 mtx_unlock(&vnode_list_mtx); 1256 kern_yield(PRI_USER); 1257 mtx_lock(&vnode_list_mtx); 1258 goto restart; 1259 } 1260 if (done == 0 && !retried) { 1261 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); 1262 TAILQ_INSERT_HEAD(&vnode_list, mvp, v_vnodelist); 1263 retried = true; 1264 goto restart; 1265 } 1266 return (done); 1267 } 1268 1269 static int max_vnlru_free = 10000; /* limit on vnode free requests per call */ 1270 SYSCTL_INT(_debug, OID_AUTO, max_vnlru_free, CTLFLAG_RW, &max_vnlru_free, 1271 0, 1272 "limit on vnode free requests per call to the vnlru_free routine"); 1273 1274 /* 1275 * Attempt to reduce the free list by the requested amount. 1276 */ 1277 static int 1278 vnlru_free_impl(int count, struct vfsops *mnt_op, struct vnode *mvp) 1279 { 1280 struct vnode *vp; 1281 struct mount *mp; 1282 int ocount; 1283 1284 mtx_assert(&vnode_list_mtx, MA_OWNED); 1285 if (count > max_vnlru_free) 1286 count = max_vnlru_free; 1287 ocount = count; 1288 vp = mvp; 1289 for (;;) { 1290 if (count == 0) { 1291 break; 1292 } 1293 vp = TAILQ_NEXT(vp, v_vnodelist); 1294 if (__predict_false(vp == NULL)) { 1295 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); 1296 TAILQ_INSERT_TAIL(&vnode_list, mvp, v_vnodelist); 1297 break; 1298 } 1299 if (__predict_false(vp->v_type == VMARKER)) 1300 continue; 1301 if (vp->v_holdcnt > 0) 1302 continue; 1303 /* 1304 * Don't recycle if our vnode is from different type 1305 * of mount point. Note that mp is type-safe, the 1306 * check does not reach unmapped address even if 1307 * vnode is reclaimed. 
1308 */ 1309 if (mnt_op != NULL && (mp = vp->v_mount) != NULL && 1310 mp->mnt_op != mnt_op) { 1311 continue; 1312 } 1313 if (__predict_false(vp->v_type == VBAD || vp->v_type == VNON)) { 1314 continue; 1315 } 1316 if (!vhold_recycle_free(vp)) 1317 continue; 1318 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); 1319 TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist); 1320 mtx_unlock(&vnode_list_mtx); 1321 /* 1322 * FIXME: ignores the return value, meaning it may be nothing 1323 * got recycled but it claims otherwise to the caller. 1324 * 1325 * Originally the value started being ignored in 2005 with 1326 * 114a1006a8204aa156e1f9ad6476cdff89cada7f . 1327 * 1328 * Respecting the value can run into significant stalls if most 1329 * vnodes belong to one file system and it has writes 1330 * suspended. In presence of many threads and millions of 1331 * vnodes they keep contending on the vnode_list_mtx lock only 1332 * to find vnodes they can't recycle. 1333 * 1334 * The solution would be to pre-check if the vnode is likely to 1335 * be recycle-able, but it needs to happen with the 1336 * vnode_list_mtx lock held. This runs into a problem where 1337 * VOP_GETWRITEMOUNT (currently needed to find out about if 1338 * writes are frozen) can take locks which LOR against it. 1339 * 1340 * Check nullfs for one example (null_getwritemount). 1341 */ 1342 vtryrecycle(vp); 1343 count--; 1344 mtx_lock(&vnode_list_mtx); 1345 vp = mvp; 1346 } 1347 return (ocount - count); 1348 } 1349 1350 static int 1351 vnlru_free_locked(int count) 1352 { 1353 1354 mtx_assert(&vnode_list_mtx, MA_OWNED); 1355 return (vnlru_free_impl(count, NULL, vnode_list_free_marker)); 1356 } 1357 1358 void 1359 vnlru_free_vfsops(int count, struct vfsops *mnt_op, struct vnode *mvp) 1360 { 1361 1362 MPASS(mnt_op != NULL); 1363 MPASS(mvp != NULL); 1364 VNPASS(mvp->v_type == VMARKER, mvp); 1365 mtx_lock(&vnode_list_mtx); 1366 vnlru_free_impl(count, mnt_op, mvp); 1367 mtx_unlock(&vnode_list_mtx); 1368 } 1369 1370 struct vnode * 1371 vnlru_alloc_marker(void) 1372 { 1373 struct vnode *mvp; 1374 1375 mvp = vn_alloc_marker(NULL); 1376 mtx_lock(&vnode_list_mtx); 1377 TAILQ_INSERT_BEFORE(vnode_list_free_marker, mvp, v_vnodelist); 1378 mtx_unlock(&vnode_list_mtx); 1379 return (mvp); 1380 } 1381 1382 void 1383 vnlru_free_marker(struct vnode *mvp) 1384 { 1385 mtx_lock(&vnode_list_mtx); 1386 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); 1387 mtx_unlock(&vnode_list_mtx); 1388 vn_free_marker(mvp); 1389 } 1390 1391 static void 1392 vnlru_recalc(void) 1393 { 1394 1395 mtx_assert(&vnode_list_mtx, MA_OWNED); 1396 gapvnodes = imax(desiredvnodes - wantfreevnodes, 100); 1397 vhiwat = gapvnodes / 11; /* 9% -- just under the 10% in vlrureclaim() */ 1398 vlowat = vhiwat / 2; 1399 } 1400 1401 /* 1402 * Attempt to recycle vnodes in a context that is always safe to block. 1403 * Calling vlrurecycle() from the bowels of filesystem code has some 1404 * interesting deadlock problems. 1405 */ 1406 static struct proc *vnlruproc; 1407 static int vnlruproc_sig; 1408 1409 /* 1410 * The main freevnodes counter is only updated when threads requeue their vnode 1411 * batches. CPUs are conditionally walked to compute a more accurate total. 1412 * 1413 * Limit how much of a slop are we willing to tolerate. Note: the actual value 1414 * at any given moment can still exceed slop, but it should not be by significant 1415 * margin in practice. 
1416 */ 1417 #define VNLRU_FREEVNODES_SLOP 128 1418 1419 static __inline void 1420 vfs_freevnodes_inc(void) 1421 { 1422 struct vdbatch *vd; 1423 1424 critical_enter(); 1425 vd = DPCPU_PTR(vd); 1426 vd->freevnodes++; 1427 critical_exit(); 1428 } 1429 1430 static __inline void 1431 vfs_freevnodes_dec(void) 1432 { 1433 struct vdbatch *vd; 1434 1435 critical_enter(); 1436 vd = DPCPU_PTR(vd); 1437 vd->freevnodes--; 1438 critical_exit(); 1439 } 1440 1441 static u_long 1442 vnlru_read_freevnodes(void) 1443 { 1444 struct vdbatch *vd; 1445 long slop; 1446 int cpu; 1447 1448 mtx_assert(&vnode_list_mtx, MA_OWNED); 1449 if (freevnodes > freevnodes_old) 1450 slop = freevnodes - freevnodes_old; 1451 else 1452 slop = freevnodes_old - freevnodes; 1453 if (slop < VNLRU_FREEVNODES_SLOP) 1454 return (freevnodes >= 0 ? freevnodes : 0); 1455 freevnodes_old = freevnodes; 1456 CPU_FOREACH(cpu) { 1457 vd = DPCPU_ID_PTR((cpu), vd); 1458 freevnodes_old += vd->freevnodes; 1459 } 1460 return (freevnodes_old >= 0 ? freevnodes_old : 0); 1461 } 1462 1463 static bool 1464 vnlru_under(u_long rnumvnodes, u_long limit) 1465 { 1466 u_long rfreevnodes, space; 1467 1468 if (__predict_false(rnumvnodes > desiredvnodes)) 1469 return (true); 1470 1471 space = desiredvnodes - rnumvnodes; 1472 if (space < limit) { 1473 rfreevnodes = vnlru_read_freevnodes(); 1474 if (rfreevnodes > wantfreevnodes) 1475 space += rfreevnodes - wantfreevnodes; 1476 } 1477 return (space < limit); 1478 } 1479 1480 static bool 1481 vnlru_under_unlocked(u_long rnumvnodes, u_long limit) 1482 { 1483 long rfreevnodes, space; 1484 1485 if (__predict_false(rnumvnodes > desiredvnodes)) 1486 return (true); 1487 1488 space = desiredvnodes - rnumvnodes; 1489 if (space < limit) { 1490 rfreevnodes = atomic_load_long(&freevnodes); 1491 if (rfreevnodes > wantfreevnodes) 1492 space += rfreevnodes - wantfreevnodes; 1493 } 1494 return (space < limit); 1495 } 1496 1497 static void 1498 vnlru_kick(void) 1499 { 1500 1501 mtx_assert(&vnode_list_mtx, MA_OWNED); 1502 if (vnlruproc_sig == 0) { 1503 vnlruproc_sig = 1; 1504 wakeup(vnlruproc); 1505 } 1506 } 1507 1508 static void 1509 vnlru_proc(void) 1510 { 1511 u_long rnumvnodes, rfreevnodes, target; 1512 unsigned long onumvnodes; 1513 int done, force, trigger, usevnodes; 1514 bool reclaim_nc_src, want_reread; 1515 1516 EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, vnlruproc, 1517 SHUTDOWN_PRI_FIRST); 1518 1519 force = 0; 1520 want_reread = false; 1521 for (;;) { 1522 kproc_suspend_check(vnlruproc); 1523 mtx_lock(&vnode_list_mtx); 1524 rnumvnodes = atomic_load_long(&numvnodes); 1525 1526 if (want_reread) { 1527 force = vnlru_under(numvnodes, vhiwat) ? 1 : 0; 1528 want_reread = false; 1529 } 1530 1531 /* 1532 * If numvnodes is too large (due to desiredvnodes being 1533 * adjusted using its sysctl, or emergency growth), first 1534 * try to reduce it by discarding from the free list. 1535 */ 1536 if (rnumvnodes > desiredvnodes) { 1537 vnlru_free_locked(rnumvnodes - desiredvnodes); 1538 rnumvnodes = atomic_load_long(&numvnodes); 1539 } 1540 /* 1541 * Sleep if the vnode cache is in a good state. This is 1542 * when it is not over-full and has space for about a 4% 1543 * or 9% expansion (by growing its size or inexcessively 1544 * reducing its free list). Otherwise, try to reclaim 1545 * space for a 10% expansion. 
1546 */ 1547 if (vstir && force == 0) { 1548 force = 1; 1549 vstir = 0; 1550 } 1551 if (force == 0 && !vnlru_under(rnumvnodes, vlowat)) { 1552 vnlruproc_sig = 0; 1553 wakeup(&vnlruproc_sig); 1554 msleep(vnlruproc, &vnode_list_mtx, 1555 PVFS|PDROP, "vlruwt", hz); 1556 continue; 1557 } 1558 rfreevnodes = vnlru_read_freevnodes(); 1559 1560 onumvnodes = rnumvnodes; 1561 /* 1562 * Calculate parameters for recycling. These are the same 1563 * throughout the loop to give some semblance of fairness. 1564 * The trigger point is to avoid recycling vnodes with lots 1565 * of resident pages. We aren't trying to free memory; we 1566 * are trying to recycle or at least free vnodes. 1567 */ 1568 if (rnumvnodes <= desiredvnodes) 1569 usevnodes = rnumvnodes - rfreevnodes; 1570 else 1571 usevnodes = rnumvnodes; 1572 if (usevnodes <= 0) 1573 usevnodes = 1; 1574 /* 1575 * The trigger value is chosen to give a conservatively 1576 * large value to ensure that it alone doesn't prevent 1577 * making progress. The value can easily be so large that 1578 * it is effectively infinite in some congested and 1579 * misconfigured cases, and this is necessary. Normally 1580 * it is about 8 to 100 (pages), which is quite large. 1581 */ 1582 trigger = vm_cnt.v_page_count * 2 / usevnodes; 1583 if (force < 2) 1584 trigger = vsmalltrigger; 1585 reclaim_nc_src = force >= 3; 1586 target = rnumvnodes * (int64_t)gapvnodes / imax(desiredvnodes, 1); 1587 target = target / 10 + 1; 1588 done = vlrureclaim(reclaim_nc_src, trigger, target); 1589 mtx_unlock(&vnode_list_mtx); 1590 if (onumvnodes > desiredvnodes && numvnodes <= desiredvnodes) 1591 uma_reclaim(UMA_RECLAIM_DRAIN); 1592 if (done == 0) { 1593 if (force == 0 || force == 1) { 1594 force = 2; 1595 continue; 1596 } 1597 if (force == 2) { 1598 force = 3; 1599 continue; 1600 } 1601 want_reread = true; 1602 force = 0; 1603 vnlru_nowhere++; 1604 tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3); 1605 } else { 1606 want_reread = true; 1607 kern_yield(PRI_USER); 1608 } 1609 } 1610 } 1611 1612 static struct kproc_desc vnlru_kp = { 1613 "vnlru", 1614 vnlru_proc, 1615 &vnlruproc 1616 }; 1617 SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, 1618 &vnlru_kp); 1619 1620 /* 1621 * Routines having to do with the management of the vnode table. 1622 */ 1623 1624 /* 1625 * Try to recycle a freed vnode. We abort if anyone picks up a reference 1626 * before we actually vgone(). This function must be called with the vnode 1627 * held to prevent the vnode from being returned to the free list midway 1628 * through vgone(). 1629 */ 1630 static int 1631 vtryrecycle(struct vnode *vp) 1632 { 1633 struct mount *vnmp; 1634 1635 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 1636 VNASSERT(vp->v_holdcnt, vp, 1637 ("vtryrecycle: Recycling vp %p without a reference.", vp)); 1638 /* 1639 * This vnode may found and locked via some other list, if so we 1640 * can't recycle it yet. 1641 */ 1642 if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) { 1643 CTR2(KTR_VFS, 1644 "%s: impossible to recycle, vp %p lock is already held", 1645 __func__, vp); 1646 vdrop_recycle(vp); 1647 return (EWOULDBLOCK); 1648 } 1649 /* 1650 * Don't recycle if its filesystem is being suspended. 
1651 */ 1652 if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) { 1653 VOP_UNLOCK(vp); 1654 CTR2(KTR_VFS, 1655 "%s: impossible to recycle, cannot start the write for %p", 1656 __func__, vp); 1657 vdrop_recycle(vp); 1658 return (EBUSY); 1659 } 1660 /* 1661 * If we got this far, we need to acquire the interlock and see if 1662 * anyone picked up this vnode from another list. If not, we will 1663 * mark it with DOOMED via vgonel() so that anyone who does find it 1664 * will skip over it. 1665 */ 1666 VI_LOCK(vp); 1667 if (vp->v_usecount) { 1668 VOP_UNLOCK(vp); 1669 vdropl_recycle(vp); 1670 vn_finished_write(vnmp); 1671 CTR2(KTR_VFS, 1672 "%s: impossible to recycle, %p is already referenced", 1673 __func__, vp); 1674 return (EBUSY); 1675 } 1676 if (!VN_IS_DOOMED(vp)) { 1677 counter_u64_add(recycles_free_count, 1); 1678 vgonel(vp); 1679 } 1680 VOP_UNLOCK(vp); 1681 vdropl_recycle(vp); 1682 vn_finished_write(vnmp); 1683 return (0); 1684 } 1685 1686 /* 1687 * Allocate a new vnode. 1688 * 1689 * The operation never returns an error. Returning an error was disabled 1690 * in r145385 (dated 2005) with the following comment: 1691 * 1692 * XXX Not all VFS_VGET/ffs_vget callers check returns. 1693 * 1694 * Given the age of this commit (almost 15 years at the time of writing this 1695 * comment) restoring the ability to fail requires a significant audit of 1696 * all codepaths. 1697 * 1698 * The routine can try to free a vnode or stall for up to 1 second waiting for 1699 * vnlru to clear things up, but ultimately always performs a M_WAITOK allocation. 1700 */ 1701 static u_long vn_alloc_cyclecount; 1702 1703 static struct vnode * __noinline 1704 vn_alloc_hard(struct mount *mp) 1705 { 1706 u_long rnumvnodes, rfreevnodes; 1707 1708 mtx_lock(&vnode_list_mtx); 1709 rnumvnodes = atomic_load_long(&numvnodes); 1710 if (rnumvnodes + 1 < desiredvnodes) { 1711 vn_alloc_cyclecount = 0; 1712 goto alloc; 1713 } 1714 rfreevnodes = vnlru_read_freevnodes(); 1715 if (vn_alloc_cyclecount++ >= rfreevnodes) { 1716 vn_alloc_cyclecount = 0; 1717 vstir = 1; 1718 } 1719 /* 1720 * Grow the vnode cache if it will not be above its target max 1721 * after growing. Otherwise, if the free list is nonempty, try 1722 * to reclaim 1 item from it before growing the cache (possibly 1723 * above its target max if the reclamation failed or is delayed). 1724 * Otherwise, wait for some space. In all cases, schedule 1725 * vnlru_proc() if we are getting short of space. The watermarks 1726 * should be chosen so that we never wait or even reclaim from 1727 * the free list to below its target minimum. 1728 */ 1729 if (vnlru_free_locked(1) > 0) 1730 goto alloc; 1731 if (mp == NULL || (mp->mnt_kern_flag & MNTK_SUSPEND) == 0) { 1732 /* 1733 * Wait for space for a new vnode. 
1734 */ 1735 vnlru_kick(); 1736 msleep(&vnlruproc_sig, &vnode_list_mtx, PVFS, "vlruwk", hz); 1737 if (atomic_load_long(&numvnodes) + 1 > desiredvnodes && 1738 vnlru_read_freevnodes() > 1) 1739 vnlru_free_locked(1); 1740 } 1741 alloc: 1742 rnumvnodes = atomic_fetchadd_long(&numvnodes, 1) + 1; 1743 if (vnlru_under(rnumvnodes, vlowat)) 1744 vnlru_kick(); 1745 mtx_unlock(&vnode_list_mtx); 1746 return (uma_zalloc_smr(vnode_zone, M_WAITOK)); 1747 } 1748 1749 static struct vnode * 1750 vn_alloc(struct mount *mp) 1751 { 1752 u_long rnumvnodes; 1753 1754 if (__predict_false(vn_alloc_cyclecount != 0)) 1755 return (vn_alloc_hard(mp)); 1756 rnumvnodes = atomic_fetchadd_long(&numvnodes, 1) + 1; 1757 if (__predict_false(vnlru_under_unlocked(rnumvnodes, vlowat))) { 1758 atomic_subtract_long(&numvnodes, 1); 1759 return (vn_alloc_hard(mp)); 1760 } 1761 1762 return (uma_zalloc_smr(vnode_zone, M_WAITOK)); 1763 } 1764 1765 static void 1766 vn_free(struct vnode *vp) 1767 { 1768 1769 atomic_subtract_long(&numvnodes, 1); 1770 uma_zfree_smr(vnode_zone, vp); 1771 } 1772 1773 /* 1774 * Return the next vnode from the free list. 1775 */ 1776 int 1777 getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops, 1778 struct vnode **vpp) 1779 { 1780 struct vnode *vp; 1781 struct thread *td; 1782 struct lock_object *lo; 1783 1784 CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag); 1785 1786 KASSERT(vops->registered, 1787 ("%s: not registered vector op %p\n", __func__, vops)); 1788 1789 td = curthread; 1790 if (td->td_vp_reserved != NULL) { 1791 vp = td->td_vp_reserved; 1792 td->td_vp_reserved = NULL; 1793 } else { 1794 vp = vn_alloc(mp); 1795 } 1796 counter_u64_add(vnodes_created, 1); 1797 1798 vn_set_state(vp, VSTATE_UNINITIALIZED); 1799 1800 /* 1801 * Locks are given the generic name "vnode" when created. 1802 * Follow the historic practice of using the filesystem 1803 * name when they allocated, e.g., "zfs", "ufs", "nfs, etc. 1804 * 1805 * Locks live in a witness group keyed on their name. Thus, 1806 * when a lock is renamed, it must also move from the witness 1807 * group of its old name to the witness group of its new name. 1808 * 1809 * The change only needs to be made when the vnode moves 1810 * from one filesystem type to another. We ensure that each 1811 * filesystem use a single static name pointer for its tag so 1812 * that we can compare pointers rather than doing a strcmp(). 1813 */ 1814 lo = &vp->v_vnlock->lock_object; 1815 #ifdef WITNESS 1816 if (lo->lo_name != tag) { 1817 #endif 1818 lo->lo_name = tag; 1819 #ifdef WITNESS 1820 WITNESS_DESTROY(lo); 1821 WITNESS_INIT(lo, tag); 1822 } 1823 #endif 1824 /* 1825 * By default, don't allow shared locks unless filesystems opt-in. 1826 */ 1827 vp->v_vnlock->lock_object.lo_flags |= LK_NOSHARE; 1828 /* 1829 * Finalize various vnode identity bits. 
1830 */ 1831 KASSERT(vp->v_object == NULL, ("stale v_object %p", vp)); 1832 KASSERT(vp->v_lockf == NULL, ("stale v_lockf %p", vp)); 1833 KASSERT(vp->v_pollinfo == NULL, ("stale v_pollinfo %p", vp)); 1834 vp->v_type = VNON; 1835 vp->v_op = vops; 1836 vp->v_irflag = 0; 1837 v_init_counters(vp); 1838 vn_seqc_init(vp); 1839 vp->v_bufobj.bo_ops = &buf_ops_bio; 1840 #ifdef DIAGNOSTIC 1841 if (mp == NULL && vops != &dead_vnodeops) 1842 printf("NULL mp in getnewvnode(9), tag %s\n", tag); 1843 #endif 1844 #ifdef MAC 1845 mac_vnode_init(vp); 1846 if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0) 1847 mac_vnode_associate_singlelabel(mp, vp); 1848 #endif 1849 if (mp != NULL) { 1850 vp->v_bufobj.bo_bsize = mp->mnt_stat.f_iosize; 1851 } 1852 1853 /* 1854 * For the filesystems which do not use vfs_hash_insert(), 1855 * still initialize v_hash to have vfs_hash_index() useful. 1856 * E.g., nullfs uses vfs_hash_index() on the lower vnode for 1857 * its own hashing. 1858 */ 1859 vp->v_hash = (uintptr_t)vp >> vnsz2log; 1860 1861 *vpp = vp; 1862 return (0); 1863 } 1864 1865 void 1866 getnewvnode_reserve(void) 1867 { 1868 struct thread *td; 1869 1870 td = curthread; 1871 MPASS(td->td_vp_reserved == NULL); 1872 td->td_vp_reserved = vn_alloc(NULL); 1873 } 1874 1875 void 1876 getnewvnode_drop_reserve(void) 1877 { 1878 struct thread *td; 1879 1880 td = curthread; 1881 if (td->td_vp_reserved != NULL) { 1882 vn_free(td->td_vp_reserved); 1883 td->td_vp_reserved = NULL; 1884 } 1885 } 1886 1887 static void __noinline 1888 freevnode(struct vnode *vp) 1889 { 1890 struct bufobj *bo; 1891 1892 /* 1893 * The vnode has been marked for destruction, so free it. 1894 * 1895 * The vnode will be returned to the zone where it will 1896 * normally remain until it is needed for another vnode. We 1897 * need to cleanup (or verify that the cleanup has already 1898 * been done) any residual data left from its current use 1899 * so as not to contaminate the freshly allocated vnode. 1900 */ 1901 CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp); 1902 /* 1903 * Paired with vgone. 1904 */ 1905 vn_seqc_write_end_free(vp); 1906 1907 bo = &vp->v_bufobj; 1908 VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't")); 1909 VNPASS(vp->v_holdcnt == VHOLD_NO_SMR, vp); 1910 VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count")); 1911 VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count")); 1912 VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's")); 1913 VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0")); 1914 VNASSERT(pctrie_is_empty(&bo->bo_clean.bv_root), vp, 1915 ("clean blk trie not empty")); 1916 VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0")); 1917 VNASSERT(pctrie_is_empty(&bo->bo_dirty.bv_root), vp, 1918 ("dirty blk trie not empty")); 1919 VNASSERT(TAILQ_EMPTY(&vp->v_rl.rl_waiters), vp, 1920 ("Dangling rangelock waiters")); 1921 VNASSERT((vp->v_iflag & (VI_DOINGINACT | VI_OWEINACT)) == 0, vp, 1922 ("Leaked inactivation")); 1923 VI_UNLOCK(vp); 1924 cache_assert_no_entries(vp); 1925 1926 #ifdef MAC 1927 mac_vnode_destroy(vp); 1928 #endif 1929 if (vp->v_pollinfo != NULL) { 1930 /* 1931 * Use LK_NOWAIT to shut up witness about the lock. We may get 1932 * here while having another vnode locked when trying to 1933 * satisfy a lookup and needing to recycle. 
1934 */ 1935 VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT); 1936 destroy_vpollinfo(vp->v_pollinfo); 1937 VOP_UNLOCK(vp); 1938 vp->v_pollinfo = NULL; 1939 } 1940 vp->v_mountedhere = NULL; 1941 vp->v_unpcb = NULL; 1942 vp->v_rdev = NULL; 1943 vp->v_fifoinfo = NULL; 1944 vp->v_iflag = 0; 1945 vp->v_vflag = 0; 1946 bo->bo_flag = 0; 1947 vn_free(vp); 1948 } 1949 1950 /* 1951 * Delete from old mount point vnode list, if on one. 1952 */ 1953 static void 1954 delmntque(struct vnode *vp) 1955 { 1956 struct mount *mp; 1957 1958 VNPASS((vp->v_mflag & VMP_LAZYLIST) == 0, vp); 1959 1960 mp = vp->v_mount; 1961 MNT_ILOCK(mp); 1962 VI_LOCK(vp); 1963 vp->v_mount = NULL; 1964 VNASSERT(mp->mnt_nvnodelistsize > 0, vp, 1965 ("bad mount point vnode list size")); 1966 TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); 1967 mp->mnt_nvnodelistsize--; 1968 MNT_REL(mp); 1969 MNT_IUNLOCK(mp); 1970 /* 1971 * The caller expects the interlock to be still held. 1972 */ 1973 ASSERT_VI_LOCKED(vp, __func__); 1974 } 1975 1976 static int 1977 insmntque1_int(struct vnode *vp, struct mount *mp, bool dtr) 1978 { 1979 1980 KASSERT(vp->v_mount == NULL, 1981 ("insmntque: vnode already on per mount vnode list")); 1982 VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)")); 1983 if ((mp->mnt_kern_flag & MNTK_UNLOCKED_INSMNTQUE) == 0) { 1984 ASSERT_VOP_ELOCKED(vp, "insmntque: non-locked vp"); 1985 } else { 1986 KASSERT(!dtr, 1987 ("%s: can't have MNTK_UNLOCKED_INSMNTQUE and cleanup", 1988 __func__)); 1989 } 1990 1991 /* 1992 * We acquire the vnode interlock early to ensure that the 1993 * vnode cannot be recycled by another process releasing a 1994 * holdcnt on it before we get it on both the vnode list 1995 * and the active vnode list. The mount mutex protects only 1996 * manipulation of the vnode list and the vnode freelist 1997 * mutex protects only manipulation of the active vnode list. 1998 * Hence the need to hold the vnode interlock throughout. 1999 */ 2000 MNT_ILOCK(mp); 2001 VI_LOCK(vp); 2002 if (((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0 && 2003 ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 || 2004 mp->mnt_nvnodelistsize == 0)) && 2005 (vp->v_vflag & VV_FORCEINSMQ) == 0) { 2006 VI_UNLOCK(vp); 2007 MNT_IUNLOCK(mp); 2008 if (dtr) { 2009 vp->v_data = NULL; 2010 vp->v_op = &dead_vnodeops; 2011 vgone(vp); 2012 vput(vp); 2013 } 2014 return (EBUSY); 2015 } 2016 vp->v_mount = mp; 2017 MNT_REF(mp); 2018 TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); 2019 VNASSERT(mp->mnt_nvnodelistsize >= 0, vp, 2020 ("neg mount point vnode list size")); 2021 mp->mnt_nvnodelistsize++; 2022 VI_UNLOCK(vp); 2023 MNT_IUNLOCK(mp); 2024 return (0); 2025 } 2026 2027 /* 2028 * Insert into list of vnodes for the new mount point, if available. 2029 * insmntque() reclaims the vnode on insertion failure, insmntque1() 2030 * leaves handling of the vnode to the caller. 2031 */ 2032 int 2033 insmntque(struct vnode *vp, struct mount *mp) 2034 { 2035 return (insmntque1_int(vp, mp, true)); 2036 } 2037 2038 int 2039 insmntque1(struct vnode *vp, struct mount *mp) 2040 { 2041 return (insmntque1_int(vp, mp, false)); 2042 } 2043 2044 /* 2045 * Flush out and invalidate all buffers associated with a bufobj 2046 * Called with the underlying object locked. 
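 * For example, a V_SAVE caller has its dirty buffers written out via
 * BO_SYNC() before anything is invalidated, while a flags == 0 caller
 * simply has them tossed.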
2047 */ 2048 int 2049 bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo) 2050 { 2051 int error; 2052 2053 BO_LOCK(bo); 2054 if (flags & V_SAVE) { 2055 error = bufobj_wwait(bo, slpflag, slptimeo); 2056 if (error) { 2057 BO_UNLOCK(bo); 2058 return (error); 2059 } 2060 if (bo->bo_dirty.bv_cnt > 0) { 2061 BO_UNLOCK(bo); 2062 do { 2063 error = BO_SYNC(bo, MNT_WAIT); 2064 } while (error == ERELOOKUP); 2065 if (error != 0) 2066 return (error); 2067 BO_LOCK(bo); 2068 if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) { 2069 BO_UNLOCK(bo); 2070 return (EBUSY); 2071 } 2072 } 2073 } 2074 /* 2075 * If you alter this loop please notice that interlock is dropped and 2076 * reacquired in flushbuflist. Special care is needed to ensure that 2077 * no race conditions occur from this. 2078 */ 2079 do { 2080 error = flushbuflist(&bo->bo_clean, 2081 flags, bo, slpflag, slptimeo); 2082 if (error == 0 && !(flags & V_CLEANONLY)) 2083 error = flushbuflist(&bo->bo_dirty, 2084 flags, bo, slpflag, slptimeo); 2085 if (error != 0 && error != EAGAIN) { 2086 BO_UNLOCK(bo); 2087 return (error); 2088 } 2089 } while (error != 0); 2090 2091 /* 2092 * Wait for I/O to complete. XXX needs cleaning up. The vnode can 2093 * have write I/O in-progress but if there is a VM object then the 2094 * VM object can also have read-I/O in-progress. 2095 */ 2096 do { 2097 bufobj_wwait(bo, 0, 0); 2098 if ((flags & V_VMIO) == 0 && bo->bo_object != NULL) { 2099 BO_UNLOCK(bo); 2100 vm_object_pip_wait_unlocked(bo->bo_object, "bovlbx"); 2101 BO_LOCK(bo); 2102 } 2103 } while (bo->bo_numoutput > 0); 2104 BO_UNLOCK(bo); 2105 2106 /* 2107 * Destroy the copy in the VM cache, too. 2108 */ 2109 if (bo->bo_object != NULL && 2110 (flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0) { 2111 VM_OBJECT_WLOCK(bo->bo_object); 2112 vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ? 2113 OBJPR_CLEANONLY : 0); 2114 VM_OBJECT_WUNLOCK(bo->bo_object); 2115 } 2116 2117 #ifdef INVARIANTS 2118 BO_LOCK(bo); 2119 if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO | 2120 V_ALLOWCLEAN)) == 0 && (bo->bo_dirty.bv_cnt > 0 || 2121 bo->bo_clean.bv_cnt > 0)) 2122 panic("vinvalbuf: flush failed"); 2123 if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0 && 2124 bo->bo_dirty.bv_cnt > 0) 2125 panic("vinvalbuf: flush dirty failed"); 2126 BO_UNLOCK(bo); 2127 #endif 2128 return (0); 2129 } 2130 2131 /* 2132 * Flush out and invalidate all buffers associated with a vnode. 2133 * Called with the underlying object locked. 2134 */ 2135 int 2136 vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo) 2137 { 2138 2139 CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags); 2140 ASSERT_VOP_LOCKED(vp, "vinvalbuf"); 2141 if (vp->v_object != NULL && vp->v_object->handle != vp) 2142 return (0); 2143 return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo)); 2144 } 2145 2146 /* 2147 * Flush out buffers on the specified list. 2148 * 2149 */ 2150 static int 2151 flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag, 2152 int slptimeo) 2153 { 2154 struct buf *bp, *nbp; 2155 int retval, error; 2156 daddr_t lblkno; 2157 b_xflags_t xflags; 2158 2159 ASSERT_BO_WLOCKED(bo); 2160 2161 retval = 0; 2162 TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) { 2163 /* 2164 * If we are flushing both V_NORMAL and V_ALT buffers then 2165 * do not skip any buffers. If we are flushing only V_NORMAL 2166 * buffers then skip buffers marked as BX_ALTDATA. 
If we are 2167 * flushing only V_ALT buffers then skip buffers not marked 2168 * as BX_ALTDATA. 2169 */ 2170 if (((flags & (V_NORMAL | V_ALT)) != (V_NORMAL | V_ALT)) && 2171 (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA) != 0) || 2172 ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0))) { 2173 continue; 2174 } 2175 if (nbp != NULL) { 2176 lblkno = nbp->b_lblkno; 2177 xflags = nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN); 2178 } 2179 retval = EAGAIN; 2180 error = BUF_TIMELOCK(bp, 2181 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo), 2182 "flushbuf", slpflag, slptimeo); 2183 if (error) { 2184 BO_LOCK(bo); 2185 return (error != ENOLCK ? error : EAGAIN); 2186 } 2187 KASSERT(bp->b_bufobj == bo, 2188 ("bp %p wrong b_bufobj %p should be %p", 2189 bp, bp->b_bufobj, bo)); 2190 /* 2191 * XXX Since there are no node locks for NFS, I 2192 * believe there is a slight chance that a delayed 2193 * write will occur while sleeping just above, so 2194 * check for it. 2195 */ 2196 if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) && 2197 (flags & V_SAVE)) { 2198 bremfree(bp); 2199 bp->b_flags |= B_ASYNC; 2200 bwrite(bp); 2201 BO_LOCK(bo); 2202 return (EAGAIN); /* XXX: why not loop ? */ 2203 } 2204 bremfree(bp); 2205 bp->b_flags |= (B_INVAL | B_RELBUF); 2206 bp->b_flags &= ~B_ASYNC; 2207 brelse(bp); 2208 BO_LOCK(bo); 2209 if (nbp == NULL) 2210 break; 2211 nbp = gbincore(bo, lblkno); 2212 if (nbp == NULL || (nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) 2213 != xflags) 2214 break; /* nbp invalid */ 2215 } 2216 return (retval); 2217 } 2218 2219 int 2220 bnoreuselist(struct bufv *bufv, struct bufobj *bo, daddr_t startn, daddr_t endn) 2221 { 2222 struct buf *bp; 2223 int error; 2224 daddr_t lblkno; 2225 2226 ASSERT_BO_LOCKED(bo); 2227 2228 for (lblkno = startn;;) { 2229 again: 2230 bp = BUF_PCTRIE_LOOKUP_GE(&bufv->bv_root, lblkno); 2231 if (bp == NULL || bp->b_lblkno >= endn || 2232 bp->b_lblkno < startn) 2233 break; 2234 error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | 2235 LK_INTERLOCK, BO_LOCKPTR(bo), "brlsfl", 0, 0); 2236 if (error != 0) { 2237 BO_RLOCK(bo); 2238 if (error == ENOLCK) 2239 goto again; 2240 return (error); 2241 } 2242 KASSERT(bp->b_bufobj == bo, 2243 ("bp %p wrong b_bufobj %p should be %p", 2244 bp, bp->b_bufobj, bo)); 2245 lblkno = bp->b_lblkno + 1; 2246 if ((bp->b_flags & B_MANAGED) == 0) 2247 bremfree(bp); 2248 bp->b_flags |= B_RELBUF; 2249 /* 2250 * In the VMIO case, use the B_NOREUSE flag to hint that the 2251 * pages backing each buffer in the range are unlikely to be 2252 * reused. Dirty buffers will have the hint applied once 2253 * they've been written. 2254 */ 2255 if ((bp->b_flags & B_VMIO) != 0) 2256 bp->b_flags |= B_NOREUSE; 2257 brelse(bp); 2258 BO_RLOCK(bo); 2259 } 2260 return (0); 2261 } 2262 2263 /* 2264 * Truncate a file's buffer and pages to a specified length. This 2265 * is in lieu of the old vinvalbuf mechanism, which performed unneeded 2266 * sync activity. 2267 */ 2268 int 2269 vtruncbuf(struct vnode *vp, off_t length, int blksize) 2270 { 2271 struct buf *bp, *nbp; 2272 struct bufobj *bo; 2273 daddr_t startlbn; 2274 2275 CTR4(KTR_VFS, "%s: vp %p with block %d:%ju", __func__, 2276 vp, blksize, (uintmax_t)length); 2277 2278 /* 2279 * Round up to the *next* lbn. 
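	 * For example, truncating to length 20000 with blksize 16384 yields
	 * startlbn 2: block 1 still holds valid data up to the new EOF and
	 * is preserved, while blocks 2 and beyond are invalidated below.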
2280 */ 2281 startlbn = howmany(length, blksize); 2282 2283 ASSERT_VOP_LOCKED(vp, "vtruncbuf"); 2284 2285 bo = &vp->v_bufobj; 2286 restart_unlocked: 2287 BO_LOCK(bo); 2288 2289 while (v_inval_buf_range_locked(vp, bo, startlbn, INT64_MAX) == EAGAIN) 2290 ; 2291 2292 if (length > 0) { 2293 restartsync: 2294 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { 2295 if (bp->b_lblkno > 0) 2296 continue; 2297 /* 2298 * Since we hold the vnode lock this should only 2299 * fail if we're racing with the buf daemon. 2300 */ 2301 if (BUF_LOCK(bp, 2302 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, 2303 BO_LOCKPTR(bo)) == ENOLCK) 2304 goto restart_unlocked; 2305 2306 VNASSERT((bp->b_flags & B_DELWRI), vp, 2307 ("buf(%p) on dirty queue without DELWRI", bp)); 2308 2309 bremfree(bp); 2310 bawrite(bp); 2311 BO_LOCK(bo); 2312 goto restartsync; 2313 } 2314 } 2315 2316 bufobj_wwait(bo, 0, 0); 2317 BO_UNLOCK(bo); 2318 vnode_pager_setsize(vp, length); 2319 2320 return (0); 2321 } 2322 2323 /* 2324 * Invalidate the cached pages of a file's buffer within the range of block 2325 * numbers [startlbn, endlbn). 2326 */ 2327 void 2328 v_inval_buf_range(struct vnode *vp, daddr_t startlbn, daddr_t endlbn, 2329 int blksize) 2330 { 2331 struct bufobj *bo; 2332 off_t start, end; 2333 2334 ASSERT_VOP_LOCKED(vp, "v_inval_buf_range"); 2335 2336 start = blksize * startlbn; 2337 end = blksize * endlbn; 2338 2339 bo = &vp->v_bufobj; 2340 BO_LOCK(bo); 2341 MPASS(blksize == bo->bo_bsize); 2342 2343 while (v_inval_buf_range_locked(vp, bo, startlbn, endlbn) == EAGAIN) 2344 ; 2345 2346 BO_UNLOCK(bo); 2347 vn_pages_remove(vp, OFF_TO_IDX(start), OFF_TO_IDX(end + PAGE_SIZE - 1)); 2348 } 2349 2350 static int 2351 v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo, 2352 daddr_t startlbn, daddr_t endlbn) 2353 { 2354 struct buf *bp, *nbp; 2355 bool anyfreed; 2356 2357 ASSERT_VOP_LOCKED(vp, "v_inval_buf_range_locked"); 2358 ASSERT_BO_LOCKED(bo); 2359 2360 do { 2361 anyfreed = false; 2362 TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) { 2363 if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn) 2364 continue; 2365 if (BUF_LOCK(bp, 2366 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, 2367 BO_LOCKPTR(bo)) == ENOLCK) { 2368 BO_LOCK(bo); 2369 return (EAGAIN); 2370 } 2371 2372 bremfree(bp); 2373 bp->b_flags |= B_INVAL | B_RELBUF; 2374 bp->b_flags &= ~B_ASYNC; 2375 brelse(bp); 2376 anyfreed = true; 2377 2378 BO_LOCK(bo); 2379 if (nbp != NULL && 2380 (((nbp->b_xflags & BX_VNCLEAN) == 0) || 2381 nbp->b_vp != vp || 2382 (nbp->b_flags & B_DELWRI) != 0)) 2383 return (EAGAIN); 2384 } 2385 2386 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { 2387 if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn) 2388 continue; 2389 if (BUF_LOCK(bp, 2390 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, 2391 BO_LOCKPTR(bo)) == ENOLCK) { 2392 BO_LOCK(bo); 2393 return (EAGAIN); 2394 } 2395 bremfree(bp); 2396 bp->b_flags |= B_INVAL | B_RELBUF; 2397 bp->b_flags &= ~B_ASYNC; 2398 brelse(bp); 2399 anyfreed = true; 2400 2401 BO_LOCK(bo); 2402 if (nbp != NULL && 2403 (((nbp->b_xflags & BX_VNDIRTY) == 0) || 2404 (nbp->b_vp != vp) || 2405 (nbp->b_flags & B_DELWRI) == 0)) 2406 return (EAGAIN); 2407 } 2408 } while (anyfreed); 2409 return (0); 2410 } 2411 2412 static void 2413 buf_vlist_remove(struct buf *bp) 2414 { 2415 struct bufv *bv; 2416 b_xflags_t flags; 2417 2418 flags = bp->b_xflags; 2419 2420 KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); 2421 ASSERT_BO_WLOCKED(bp->b_bufobj); 2422 KASSERT((flags & (BX_VNDIRTY | BX_VNCLEAN)) != 0 && 
2423 (flags & (BX_VNDIRTY | BX_VNCLEAN)) != (BX_VNDIRTY | BX_VNCLEAN), 2424 ("%s: buffer %p has invalid queue state", __func__, bp)); 2425 2426 if ((flags & BX_VNDIRTY) != 0) 2427 bv = &bp->b_bufobj->bo_dirty; 2428 else 2429 bv = &bp->b_bufobj->bo_clean; 2430 BUF_PCTRIE_REMOVE(&bv->bv_root, bp->b_lblkno); 2431 TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs); 2432 bv->bv_cnt--; 2433 bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN); 2434 } 2435 2436 /* 2437 * Add the buffer to the sorted clean or dirty block list. 2438 * 2439 * NOTE: xflags is passed as a constant, optimizing this inline function! 2440 */ 2441 static void 2442 buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags) 2443 { 2444 struct bufv *bv; 2445 struct buf *n; 2446 int error; 2447 2448 ASSERT_BO_WLOCKED(bo); 2449 KASSERT((bo->bo_flag & BO_NOBUFS) == 0, 2450 ("buf_vlist_add: bo %p does not allow bufs", bo)); 2451 KASSERT((xflags & BX_VNDIRTY) == 0 || (bo->bo_flag & BO_DEAD) == 0, 2452 ("dead bo %p", bo)); 2453 KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, 2454 ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags)); 2455 bp->b_xflags |= xflags; 2456 if (xflags & BX_VNDIRTY) 2457 bv = &bo->bo_dirty; 2458 else 2459 bv = &bo->bo_clean; 2460 2461 /* 2462 * Keep the list ordered. Optimize empty list insertion. Assume 2463 * we tend to grow at the tail so lookup_le should usually be cheaper 2464 * than _ge. 2465 */ 2466 if (bv->bv_cnt == 0 || 2467 bp->b_lblkno > TAILQ_LAST(&bv->bv_hd, buflists)->b_lblkno) 2468 TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs); 2469 else if ((n = BUF_PCTRIE_LOOKUP_LE(&bv->bv_root, bp->b_lblkno)) == NULL) 2470 TAILQ_INSERT_HEAD(&bv->bv_hd, bp, b_bobufs); 2471 else 2472 TAILQ_INSERT_AFTER(&bv->bv_hd, n, bp, b_bobufs); 2473 error = BUF_PCTRIE_INSERT(&bv->bv_root, bp); 2474 if (error) 2475 panic("buf_vlist_add: Preallocated nodes insufficient."); 2476 bv->bv_cnt++; 2477 } 2478 2479 /* 2480 * Look up a buffer using the buffer tries. 2481 */ 2482 struct buf * 2483 gbincore(struct bufobj *bo, daddr_t lblkno) 2484 { 2485 struct buf *bp; 2486 2487 ASSERT_BO_LOCKED(bo); 2488 bp = BUF_PCTRIE_LOOKUP(&bo->bo_clean.bv_root, lblkno); 2489 if (bp != NULL) 2490 return (bp); 2491 return (BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, lblkno)); 2492 } 2493 2494 /* 2495 * Look up a buf using the buffer tries, without the bufobj lock. This relies 2496 * on SMR for safe lookup, and bufs being in a no-free zone to provide type 2497 * stability of the result. Like other lockless lookups, the found buf may 2498 * already be invalid by the time this function returns. 2499 */ 2500 struct buf * 2501 gbincore_unlocked(struct bufobj *bo, daddr_t lblkno) 2502 { 2503 struct buf *bp; 2504 2505 ASSERT_BO_UNLOCKED(bo); 2506 bp = BUF_PCTRIE_LOOKUP_UNLOCKED(&bo->bo_clean.bv_root, lblkno); 2507 if (bp != NULL) 2508 return (bp); 2509 return (BUF_PCTRIE_LOOKUP_UNLOCKED(&bo->bo_dirty.bv_root, lblkno)); 2510 } 2511 2512 /* 2513 * Associate a buffer with a vnode. 2514 */ 2515 void 2516 bgetvp(struct vnode *vp, struct buf *bp) 2517 { 2518 struct bufobj *bo; 2519 2520 bo = &vp->v_bufobj; 2521 ASSERT_BO_WLOCKED(bo); 2522 VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free")); 2523 2524 CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags); 2525 VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp, 2526 ("bgetvp: bp already attached! %p", bp)); 2527 2528 vhold(vp); 2529 bp->b_vp = vp; 2530 bp->b_bufobj = bo; 2531 /* 2532 * Insert onto list for new vnode. 
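	 * A new association always starts on the clean list; reassignbuf()
	 * moves the buffer to the dirty list once it is marked B_DELWRI.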
2533 */ 2534 buf_vlist_add(bp, bo, BX_VNCLEAN); 2535 } 2536 2537 /* 2538 * Disassociate a buffer from a vnode. 2539 */ 2540 void 2541 brelvp(struct buf *bp) 2542 { 2543 struct bufobj *bo; 2544 struct vnode *vp; 2545 2546 CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); 2547 KASSERT(bp->b_vp != NULL, ("brelvp: NULL")); 2548 2549 /* 2550 * Delete from old vnode list, if on one. 2551 */ 2552 vp = bp->b_vp; /* XXX */ 2553 bo = bp->b_bufobj; 2554 BO_LOCK(bo); 2555 buf_vlist_remove(bp); 2556 if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { 2557 bo->bo_flag &= ~BO_ONWORKLST; 2558 mtx_lock(&sync_mtx); 2559 LIST_REMOVE(bo, bo_synclist); 2560 syncer_worklist_len--; 2561 mtx_unlock(&sync_mtx); 2562 } 2563 bp->b_vp = NULL; 2564 bp->b_bufobj = NULL; 2565 BO_UNLOCK(bo); 2566 vdrop(vp); 2567 } 2568 2569 /* 2570 * Add an item to the syncer work queue. 2571 */ 2572 static void 2573 vn_syncer_add_to_worklist(struct bufobj *bo, int delay) 2574 { 2575 int slot; 2576 2577 ASSERT_BO_WLOCKED(bo); 2578 2579 mtx_lock(&sync_mtx); 2580 if (bo->bo_flag & BO_ONWORKLST) 2581 LIST_REMOVE(bo, bo_synclist); 2582 else { 2583 bo->bo_flag |= BO_ONWORKLST; 2584 syncer_worklist_len++; 2585 } 2586 2587 if (delay > syncer_maxdelay - 2) 2588 delay = syncer_maxdelay - 2; 2589 slot = (syncer_delayno + delay) & syncer_mask; 2590 2591 LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist); 2592 mtx_unlock(&sync_mtx); 2593 } 2594 2595 static int 2596 sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS) 2597 { 2598 int error, len; 2599 2600 mtx_lock(&sync_mtx); 2601 len = syncer_worklist_len - sync_vnode_count; 2602 mtx_unlock(&sync_mtx); 2603 error = SYSCTL_OUT(req, &len, sizeof(len)); 2604 return (error); 2605 } 2606 2607 SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, 2608 CTLTYPE_INT | CTLFLAG_MPSAFE| CTLFLAG_RD, NULL, 0, 2609 sysctl_vfs_worklist_len, "I", "Syncer thread worklist length"); 2610 2611 static struct proc *updateproc; 2612 static void sched_sync(void); 2613 static struct kproc_desc up_kp = { 2614 "syncer", 2615 sched_sync, 2616 &updateproc 2617 }; 2618 SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp); 2619 2620 static int 2621 sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td) 2622 { 2623 struct vnode *vp; 2624 struct mount *mp; 2625 2626 *bo = LIST_FIRST(slp); 2627 if (*bo == NULL) 2628 return (0); 2629 vp = bo2vnode(*bo); 2630 if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0) 2631 return (1); 2632 /* 2633 * We use vhold in case the vnode does not 2634 * successfully sync. vhold prevents the vnode from 2635 * going away when we unlock the sync_mtx so that 2636 * we can acquire the vnode interlock. 2637 */ 2638 vholdl(vp); 2639 mtx_unlock(&sync_mtx); 2640 VI_UNLOCK(vp); 2641 if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { 2642 vdrop(vp); 2643 mtx_lock(&sync_mtx); 2644 return (*bo == LIST_FIRST(slp)); 2645 } 2646 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 2647 (void) VOP_FSYNC(vp, MNT_LAZY, td); 2648 VOP_UNLOCK(vp); 2649 vn_finished_write(mp); 2650 BO_LOCK(*bo); 2651 if (((*bo)->bo_flag & BO_ONWORKLST) != 0) { 2652 /* 2653 * Put us back on the worklist. The worklist 2654 * routine will remove us from our current 2655 * position and then add us back in at a later 2656 * position. 2657 */ 2658 vn_syncer_add_to_worklist(*bo, syncdelay); 2659 } 2660 BO_UNLOCK(*bo); 2661 vdrop(vp); 2662 mtx_lock(&sync_mtx); 2663 return (0); 2664 } 2665 2666 static int first_printf = 1; 2667 2668 /* 2669 * System filesystem synchronizer daemon. 
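 * Roughly once a second (faster while rushjob is set) the daemon advances
 * syncer_delayno to the next worklist slot and flushes the bufobjs found
 * there, so a bufobj queued via vn_syncer_add_to_worklist() is synced
 * about 'delay' seconds after it was queued.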
2670 */ 2671 static void 2672 sched_sync(void) 2673 { 2674 struct synclist *next, *slp; 2675 struct bufobj *bo; 2676 long starttime; 2677 struct thread *td = curthread; 2678 int last_work_seen; 2679 int net_worklist_len; 2680 int syncer_final_iter; 2681 int error; 2682 2683 last_work_seen = 0; 2684 syncer_final_iter = 0; 2685 syncer_state = SYNCER_RUNNING; 2686 starttime = time_uptime; 2687 td->td_pflags |= TDP_NORUNNINGBUF; 2688 2689 EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc, 2690 SHUTDOWN_PRI_LAST); 2691 2692 mtx_lock(&sync_mtx); 2693 for (;;) { 2694 if (syncer_state == SYNCER_FINAL_DELAY && 2695 syncer_final_iter == 0) { 2696 mtx_unlock(&sync_mtx); 2697 kproc_suspend_check(td->td_proc); 2698 mtx_lock(&sync_mtx); 2699 } 2700 net_worklist_len = syncer_worklist_len - sync_vnode_count; 2701 if (syncer_state != SYNCER_RUNNING && 2702 starttime != time_uptime) { 2703 if (first_printf) { 2704 printf("\nSyncing disks, vnodes remaining... "); 2705 first_printf = 0; 2706 } 2707 printf("%d ", net_worklist_len); 2708 } 2709 starttime = time_uptime; 2710 2711 /* 2712 * Push files whose dirty time has expired. Be careful 2713 * of interrupt race on slp queue. 2714 * 2715 * Skip over empty worklist slots when shutting down. 2716 */ 2717 do { 2718 slp = &syncer_workitem_pending[syncer_delayno]; 2719 syncer_delayno += 1; 2720 if (syncer_delayno == syncer_maxdelay) 2721 syncer_delayno = 0; 2722 next = &syncer_workitem_pending[syncer_delayno]; 2723 /* 2724 * If the worklist has wrapped since the 2725 * it was emptied of all but syncer vnodes, 2726 * switch to the FINAL_DELAY state and run 2727 * for one more second. 2728 */ 2729 if (syncer_state == SYNCER_SHUTTING_DOWN && 2730 net_worklist_len == 0 && 2731 last_work_seen == syncer_delayno) { 2732 syncer_state = SYNCER_FINAL_DELAY; 2733 syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP; 2734 } 2735 } while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) && 2736 syncer_worklist_len > 0); 2737 2738 /* 2739 * Keep track of the last time there was anything 2740 * on the worklist other than syncer vnodes. 2741 * Return to the SHUTTING_DOWN state if any 2742 * new work appears. 2743 */ 2744 if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING) 2745 last_work_seen = syncer_delayno; 2746 if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY) 2747 syncer_state = SYNCER_SHUTTING_DOWN; 2748 while (!LIST_EMPTY(slp)) { 2749 error = sync_vnode(slp, &bo, td); 2750 if (error == 1) { 2751 LIST_REMOVE(bo, bo_synclist); 2752 LIST_INSERT_HEAD(next, bo, bo_synclist); 2753 continue; 2754 } 2755 2756 if (first_printf == 0) { 2757 /* 2758 * Drop the sync mutex, because some watchdog 2759 * drivers need to sleep while patting 2760 */ 2761 mtx_unlock(&sync_mtx); 2762 wdog_kern_pat(WD_LASTVAL); 2763 mtx_lock(&sync_mtx); 2764 } 2765 } 2766 if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0) 2767 syncer_final_iter--; 2768 /* 2769 * The variable rushjob allows the kernel to speed up the 2770 * processing of the filesystem syncer process. A rushjob 2771 * value of N tells the filesystem syncer to process the next 2772 * N seconds worth of work on its queue ASAP. Currently rushjob 2773 * is used by the soft update code to speed up the filesystem 2774 * syncer process when the incore state is getting so far 2775 * ahead of the disk that the kernel memory pool is being 2776 * threatened with exhaustion. 
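		 * speedup_syncer() below is the interface used to bump
		 * rushjob; each unit makes this loop skip its one second
		 * sleep for one extra iteration.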
2777 */ 2778 if (rushjob > 0) { 2779 rushjob -= 1; 2780 continue; 2781 } 2782 /* 2783 * Just sleep for a short period of time between 2784 * iterations when shutting down to allow some I/O 2785 * to happen. 2786 * 2787 * If it has taken us less than a second to process the 2788 * current work, then wait. Otherwise start right over 2789 * again. We can still lose time if any single round 2790 * takes more than two seconds, but it does not really 2791 * matter as we are just trying to generally pace the 2792 * filesystem activity. 2793 */ 2794 if (syncer_state != SYNCER_RUNNING || 2795 time_uptime == starttime) { 2796 thread_lock(td); 2797 sched_prio(td, PPAUSE); 2798 thread_unlock(td); 2799 } 2800 if (syncer_state != SYNCER_RUNNING) 2801 cv_timedwait(&sync_wakeup, &sync_mtx, 2802 hz / SYNCER_SHUTDOWN_SPEEDUP); 2803 else if (time_uptime == starttime) 2804 cv_timedwait(&sync_wakeup, &sync_mtx, hz); 2805 } 2806 } 2807 2808 /* 2809 * Request the syncer daemon to speed up its work. 2810 * We never push it to speed up more than half of its 2811 * normal turn time, otherwise it could take over the cpu. 2812 */ 2813 int 2814 speedup_syncer(void) 2815 { 2816 int ret = 0; 2817 2818 mtx_lock(&sync_mtx); 2819 if (rushjob < syncdelay / 2) { 2820 rushjob += 1; 2821 stat_rush_requests += 1; 2822 ret = 1; 2823 } 2824 mtx_unlock(&sync_mtx); 2825 cv_broadcast(&sync_wakeup); 2826 return (ret); 2827 } 2828 2829 /* 2830 * Tell the syncer to speed up its work and run though its work 2831 * list several times, then tell it to shut down. 2832 */ 2833 static void 2834 syncer_shutdown(void *arg, int howto) 2835 { 2836 2837 if (howto & RB_NOSYNC) 2838 return; 2839 mtx_lock(&sync_mtx); 2840 syncer_state = SYNCER_SHUTTING_DOWN; 2841 rushjob = 0; 2842 mtx_unlock(&sync_mtx); 2843 cv_broadcast(&sync_wakeup); 2844 kproc_shutdown(arg, howto); 2845 } 2846 2847 void 2848 syncer_suspend(void) 2849 { 2850 2851 syncer_shutdown(updateproc, 0); 2852 } 2853 2854 void 2855 syncer_resume(void) 2856 { 2857 2858 mtx_lock(&sync_mtx); 2859 first_printf = 1; 2860 syncer_state = SYNCER_RUNNING; 2861 mtx_unlock(&sync_mtx); 2862 cv_broadcast(&sync_wakeup); 2863 kproc_resume(updateproc); 2864 } 2865 2866 /* 2867 * Move the buffer between the clean and dirty lists of its vnode. 2868 */ 2869 void 2870 reassignbuf(struct buf *bp) 2871 { 2872 struct vnode *vp; 2873 struct bufobj *bo; 2874 int delay; 2875 #ifdef INVARIANTS 2876 struct bufv *bv; 2877 #endif 2878 2879 vp = bp->b_vp; 2880 bo = bp->b_bufobj; 2881 2882 KASSERT((bp->b_flags & B_PAGING) == 0, 2883 ("%s: cannot reassign paging buffer %p", __func__, bp)); 2884 2885 CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X", 2886 bp, bp->b_vp, bp->b_flags); 2887 2888 BO_LOCK(bo); 2889 buf_vlist_remove(bp); 2890 2891 /* 2892 * If dirty, put on list of dirty buffers; otherwise insert onto list 2893 * of clean buffers. 
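	 * Dirty buffers also place the bufobj on the syncer worklist, with
	 * the delay picked by vnode type (dirdelay, metadelay or filedelay).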
2894 */ 2895 if (bp->b_flags & B_DELWRI) { 2896 if ((bo->bo_flag & BO_ONWORKLST) == 0) { 2897 switch (vp->v_type) { 2898 case VDIR: 2899 delay = dirdelay; 2900 break; 2901 case VCHR: 2902 delay = metadelay; 2903 break; 2904 default: 2905 delay = filedelay; 2906 } 2907 vn_syncer_add_to_worklist(bo, delay); 2908 } 2909 buf_vlist_add(bp, bo, BX_VNDIRTY); 2910 } else { 2911 buf_vlist_add(bp, bo, BX_VNCLEAN); 2912 2913 if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { 2914 mtx_lock(&sync_mtx); 2915 LIST_REMOVE(bo, bo_synclist); 2916 syncer_worklist_len--; 2917 mtx_unlock(&sync_mtx); 2918 bo->bo_flag &= ~BO_ONWORKLST; 2919 } 2920 } 2921 #ifdef INVARIANTS 2922 bv = &bo->bo_clean; 2923 bp = TAILQ_FIRST(&bv->bv_hd); 2924 KASSERT(bp == NULL || bp->b_bufobj == bo, 2925 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 2926 bp = TAILQ_LAST(&bv->bv_hd, buflists); 2927 KASSERT(bp == NULL || bp->b_bufobj == bo, 2928 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 2929 bv = &bo->bo_dirty; 2930 bp = TAILQ_FIRST(&bv->bv_hd); 2931 KASSERT(bp == NULL || bp->b_bufobj == bo, 2932 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 2933 bp = TAILQ_LAST(&bv->bv_hd, buflists); 2934 KASSERT(bp == NULL || bp->b_bufobj == bo, 2935 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 2936 #endif 2937 BO_UNLOCK(bo); 2938 } 2939 2940 static void 2941 v_init_counters(struct vnode *vp) 2942 { 2943 2944 VNASSERT(vp->v_type == VNON && vp->v_data == NULL && vp->v_iflag == 0, 2945 vp, ("%s called for an initialized vnode", __FUNCTION__)); 2946 ASSERT_VI_UNLOCKED(vp, __FUNCTION__); 2947 2948 refcount_init(&vp->v_holdcnt, 1); 2949 refcount_init(&vp->v_usecount, 1); 2950 } 2951 2952 /* 2953 * Grab a particular vnode from the free list, increment its 2954 * reference count and lock it. VIRF_DOOMED is set if the vnode 2955 * is being destroyed. Only callers who specify LK_RETRY will 2956 * see doomed vnodes. If inactive processing was delayed in 2957 * vput try to do it here. 2958 * 2959 * usecount is manipulated using atomics without holding any locks. 2960 * 2961 * holdcnt can be manipulated using atomics without holding any locks, 2962 * except when transitioning 1<->0, in which case the interlock is held. 2963 * 2964 * Consumers which don't guarantee liveness of the vnode can use SMR to 2965 * try to get a reference. Note this operation can fail since the vnode 2966 * may be awaiting getting freed by the time they get to it. 
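 *
 * Illustrative use only (the lookup and the error handling are
 * placeholders, not a verbatim consumer):
 *
 *	vfs_smr_enter();
 *	vp = ...;		(some lockless lookup)
 *	vs = vget_prep_smr(vp);
 *	vfs_smr_exit();
 *	if (vs == VGET_NONE)
 *		...;		(retry or fail the lookup)
 *	error = vget_finish(vp, LK_SHARED, vs);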
2967 */ 2968 enum vgetstate 2969 vget_prep_smr(struct vnode *vp) 2970 { 2971 enum vgetstate vs; 2972 2973 VFS_SMR_ASSERT_ENTERED(); 2974 2975 if (refcount_acquire_if_not_zero(&vp->v_usecount)) { 2976 vs = VGET_USECOUNT; 2977 } else { 2978 if (vhold_smr(vp)) 2979 vs = VGET_HOLDCNT; 2980 else 2981 vs = VGET_NONE; 2982 } 2983 return (vs); 2984 } 2985 2986 enum vgetstate 2987 vget_prep(struct vnode *vp) 2988 { 2989 enum vgetstate vs; 2990 2991 if (refcount_acquire_if_not_zero(&vp->v_usecount)) { 2992 vs = VGET_USECOUNT; 2993 } else { 2994 vhold(vp); 2995 vs = VGET_HOLDCNT; 2996 } 2997 return (vs); 2998 } 2999 3000 void 3001 vget_abort(struct vnode *vp, enum vgetstate vs) 3002 { 3003 3004 switch (vs) { 3005 case VGET_USECOUNT: 3006 vrele(vp); 3007 break; 3008 case VGET_HOLDCNT: 3009 vdrop(vp); 3010 break; 3011 default: 3012 __assert_unreachable(); 3013 } 3014 } 3015 3016 int 3017 vget(struct vnode *vp, int flags) 3018 { 3019 enum vgetstate vs; 3020 3021 vs = vget_prep(vp); 3022 return (vget_finish(vp, flags, vs)); 3023 } 3024 3025 int 3026 vget_finish(struct vnode *vp, int flags, enum vgetstate vs) 3027 { 3028 int error; 3029 3030 if ((flags & LK_INTERLOCK) != 0) 3031 ASSERT_VI_LOCKED(vp, __func__); 3032 else 3033 ASSERT_VI_UNLOCKED(vp, __func__); 3034 VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp); 3035 VNPASS(vp->v_holdcnt > 0, vp); 3036 VNPASS(vs == VGET_HOLDCNT || vp->v_usecount > 0, vp); 3037 3038 error = vn_lock(vp, flags); 3039 if (__predict_false(error != 0)) { 3040 vget_abort(vp, vs); 3041 CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__, 3042 vp); 3043 return (error); 3044 } 3045 3046 vget_finish_ref(vp, vs); 3047 return (0); 3048 } 3049 3050 void 3051 vget_finish_ref(struct vnode *vp, enum vgetstate vs) 3052 { 3053 int old; 3054 3055 VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp); 3056 VNPASS(vp->v_holdcnt > 0, vp); 3057 VNPASS(vs == VGET_HOLDCNT || vp->v_usecount > 0, vp); 3058 3059 if (vs == VGET_USECOUNT) 3060 return; 3061 3062 /* 3063 * We hold the vnode. If the usecount is 0 it will be utilized to keep 3064 * the vnode around. Otherwise someone else lended their hold count and 3065 * we have to drop ours. 3066 */ 3067 old = atomic_fetchadd_int(&vp->v_usecount, 1); 3068 VNASSERT(old >= 0, vp, ("%s: wrong use count %d", __func__, old)); 3069 if (old != 0) { 3070 #ifdef INVARIANTS 3071 old = atomic_fetchadd_int(&vp->v_holdcnt, -1); 3072 VNASSERT(old > 1, vp, ("%s: wrong hold count %d", __func__, old)); 3073 #else 3074 refcount_release(&vp->v_holdcnt); 3075 #endif 3076 } 3077 } 3078 3079 void 3080 vref(struct vnode *vp) 3081 { 3082 enum vgetstate vs; 3083 3084 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3085 vs = vget_prep(vp); 3086 vget_finish_ref(vp, vs); 3087 } 3088 3089 void 3090 vrefact(struct vnode *vp) 3091 { 3092 3093 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3094 #ifdef INVARIANTS 3095 int old = atomic_fetchadd_int(&vp->v_usecount, 1); 3096 VNASSERT(old > 0, vp, ("%s: wrong use count %d", __func__, old)); 3097 #else 3098 refcount_acquire(&vp->v_usecount); 3099 #endif 3100 } 3101 3102 void 3103 vlazy(struct vnode *vp) 3104 { 3105 struct mount *mp; 3106 3107 VNASSERT(vp->v_holdcnt > 0, vp, ("%s: vnode not held", __func__)); 3108 3109 if ((vp->v_mflag & VMP_LAZYLIST) != 0) 3110 return; 3111 /* 3112 * We may get here for inactive routines after the vnode got doomed. 
3113 */ 3114 if (VN_IS_DOOMED(vp)) 3115 return; 3116 mp = vp->v_mount; 3117 mtx_lock(&mp->mnt_listmtx); 3118 if ((vp->v_mflag & VMP_LAZYLIST) == 0) { 3119 vp->v_mflag |= VMP_LAZYLIST; 3120 TAILQ_INSERT_TAIL(&mp->mnt_lazyvnodelist, vp, v_lazylist); 3121 mp->mnt_lazyvnodelistsize++; 3122 } 3123 mtx_unlock(&mp->mnt_listmtx); 3124 } 3125 3126 static void 3127 vunlazy(struct vnode *vp) 3128 { 3129 struct mount *mp; 3130 3131 ASSERT_VI_LOCKED(vp, __func__); 3132 VNPASS(!VN_IS_DOOMED(vp), vp); 3133 3134 mp = vp->v_mount; 3135 mtx_lock(&mp->mnt_listmtx); 3136 VNPASS(vp->v_mflag & VMP_LAZYLIST, vp); 3137 /* 3138 * Don't remove the vnode from the lazy list if another thread 3139 * has increased the hold count. It may have re-enqueued the 3140 * vnode to the lazy list and is now responsible for its 3141 * removal. 3142 */ 3143 if (vp->v_holdcnt == 0) { 3144 vp->v_mflag &= ~VMP_LAZYLIST; 3145 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, vp, v_lazylist); 3146 mp->mnt_lazyvnodelistsize--; 3147 } 3148 mtx_unlock(&mp->mnt_listmtx); 3149 } 3150 3151 /* 3152 * This routine is only meant to be called from vgonel prior to dooming 3153 * the vnode. 3154 */ 3155 static void 3156 vunlazy_gone(struct vnode *vp) 3157 { 3158 struct mount *mp; 3159 3160 ASSERT_VOP_ELOCKED(vp, __func__); 3161 ASSERT_VI_LOCKED(vp, __func__); 3162 VNPASS(!VN_IS_DOOMED(vp), vp); 3163 3164 if (vp->v_mflag & VMP_LAZYLIST) { 3165 mp = vp->v_mount; 3166 mtx_lock(&mp->mnt_listmtx); 3167 VNPASS(vp->v_mflag & VMP_LAZYLIST, vp); 3168 vp->v_mflag &= ~VMP_LAZYLIST; 3169 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, vp, v_lazylist); 3170 mp->mnt_lazyvnodelistsize--; 3171 mtx_unlock(&mp->mnt_listmtx); 3172 } 3173 } 3174 3175 static void 3176 vdefer_inactive(struct vnode *vp) 3177 { 3178 3179 ASSERT_VI_LOCKED(vp, __func__); 3180 VNASSERT(vp->v_holdcnt > 0, vp, 3181 ("%s: vnode without hold count", __func__)); 3182 if (VN_IS_DOOMED(vp)) { 3183 vdropl(vp); 3184 return; 3185 } 3186 if (vp->v_iflag & VI_DEFINACT) { 3187 VNASSERT(vp->v_holdcnt > 1, vp, ("lost hold count")); 3188 vdropl(vp); 3189 return; 3190 } 3191 if (vp->v_usecount > 0) { 3192 vp->v_iflag &= ~VI_OWEINACT; 3193 vdropl(vp); 3194 return; 3195 } 3196 vlazy(vp); 3197 vp->v_iflag |= VI_DEFINACT; 3198 VI_UNLOCK(vp); 3199 counter_u64_add(deferred_inact, 1); 3200 } 3201 3202 static void 3203 vdefer_inactive_unlocked(struct vnode *vp) 3204 { 3205 3206 VI_LOCK(vp); 3207 if ((vp->v_iflag & VI_OWEINACT) == 0) { 3208 vdropl(vp); 3209 return; 3210 } 3211 vdefer_inactive(vp); 3212 } 3213 3214 enum vput_op { VRELE, VPUT, VUNREF }; 3215 3216 /* 3217 * Handle ->v_usecount transitioning to 0. 3218 * 3219 * By releasing the last usecount we take ownership of the hold count which 3220 * provides liveness of the vnode, meaning we have to vdrop. 3221 * 3222 * For all vnodes we may need to perform inactive processing. It requires an 3223 * exclusive lock on the vnode, while it is legal to call here with only a 3224 * shared lock (or no locks). If locking the vnode in an expected manner fails, 3225 * inactive processing gets deferred to the syncer. 3226 * 3227 * XXX Some filesystems pass in an exclusively locked vnode and strongly depend 3228 * on the lock being held all the way until VOP_INACTIVE. This in particular 3229 * happens with UFS which adds half-constructed vnodes to the hash, where they 3230 * can be found by other code. 
3231 */ 3232 static void 3233 vput_final(struct vnode *vp, enum vput_op func) 3234 { 3235 int error; 3236 bool want_unlock; 3237 3238 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3239 VNPASS(vp->v_holdcnt > 0, vp); 3240 3241 VI_LOCK(vp); 3242 3243 /* 3244 * By the time we got here someone else might have transitioned 3245 * the count back to > 0. 3246 */ 3247 if (vp->v_usecount > 0) 3248 goto out; 3249 3250 /* 3251 * If the vnode is doomed vgone already performed inactive processing 3252 * (if needed). 3253 */ 3254 if (VN_IS_DOOMED(vp)) 3255 goto out; 3256 3257 if (__predict_true(VOP_NEED_INACTIVE(vp) == 0)) 3258 goto out; 3259 3260 if (vp->v_iflag & VI_DOINGINACT) 3261 goto out; 3262 3263 /* 3264 * Locking operations here will drop the interlock and possibly the 3265 * vnode lock, opening a window where the vnode can get doomed all the 3266 * while ->v_usecount is 0. Set VI_OWEINACT to let vgone know to 3267 * perform inactive. 3268 */ 3269 vp->v_iflag |= VI_OWEINACT; 3270 want_unlock = false; 3271 error = 0; 3272 switch (func) { 3273 case VRELE: 3274 switch (VOP_ISLOCKED(vp)) { 3275 case LK_EXCLUSIVE: 3276 break; 3277 case LK_EXCLOTHER: 3278 case 0: 3279 want_unlock = true; 3280 error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK); 3281 VI_LOCK(vp); 3282 break; 3283 default: 3284 /* 3285 * The lock has at least one sharer, but we have no way 3286 * to conclude whether this is us. Play it safe and 3287 * defer processing. 3288 */ 3289 error = EAGAIN; 3290 break; 3291 } 3292 break; 3293 case VPUT: 3294 want_unlock = true; 3295 if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { 3296 error = VOP_LOCK(vp, LK_UPGRADE | LK_INTERLOCK | 3297 LK_NOWAIT); 3298 VI_LOCK(vp); 3299 } 3300 break; 3301 case VUNREF: 3302 if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { 3303 error = VOP_LOCK(vp, LK_TRYUPGRADE | LK_INTERLOCK); 3304 VI_LOCK(vp); 3305 } 3306 break; 3307 } 3308 if (error == 0) { 3309 if (func == VUNREF) { 3310 VNASSERT((vp->v_vflag & VV_UNREF) == 0, vp, 3311 ("recursive vunref")); 3312 vp->v_vflag |= VV_UNREF; 3313 } 3314 for (;;) { 3315 error = vinactive(vp); 3316 if (want_unlock) 3317 VOP_UNLOCK(vp); 3318 if (error != ERELOOKUP || !want_unlock) 3319 break; 3320 VOP_LOCK(vp, LK_EXCLUSIVE); 3321 } 3322 if (func == VUNREF) 3323 vp->v_vflag &= ~VV_UNREF; 3324 vdropl(vp); 3325 } else { 3326 vdefer_inactive(vp); 3327 } 3328 return; 3329 out: 3330 if (func == VPUT) 3331 VOP_UNLOCK(vp); 3332 vdropl(vp); 3333 } 3334 3335 /* 3336 * Decrement ->v_usecount for a vnode. 3337 * 3338 * Releasing the last use count requires additional processing, see vput_final 3339 * above for details. 3340 * 3341 * Comment above each variant denotes lock state on entry and exit. 
3342 */ 3343 3344 /* 3345 * in: any 3346 * out: same as passed in 3347 */ 3348 void 3349 vrele(struct vnode *vp) 3350 { 3351 3352 ASSERT_VI_UNLOCKED(vp, __func__); 3353 if (!refcount_release(&vp->v_usecount)) 3354 return; 3355 vput_final(vp, VRELE); 3356 } 3357 3358 /* 3359 * in: locked 3360 * out: unlocked 3361 */ 3362 void 3363 vput(struct vnode *vp) 3364 { 3365 3366 ASSERT_VOP_LOCKED(vp, __func__); 3367 ASSERT_VI_UNLOCKED(vp, __func__); 3368 if (!refcount_release(&vp->v_usecount)) { 3369 VOP_UNLOCK(vp); 3370 return; 3371 } 3372 vput_final(vp, VPUT); 3373 } 3374 3375 /* 3376 * in: locked 3377 * out: locked 3378 */ 3379 void 3380 vunref(struct vnode *vp) 3381 { 3382 3383 ASSERT_VOP_LOCKED(vp, __func__); 3384 ASSERT_VI_UNLOCKED(vp, __func__); 3385 if (!refcount_release(&vp->v_usecount)) 3386 return; 3387 vput_final(vp, VUNREF); 3388 } 3389 3390 void 3391 vhold(struct vnode *vp) 3392 { 3393 int old; 3394 3395 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3396 old = atomic_fetchadd_int(&vp->v_holdcnt, 1); 3397 VNASSERT(old >= 0 && (old & VHOLD_ALL_FLAGS) == 0, vp, 3398 ("%s: wrong hold count %d", __func__, old)); 3399 if (old == 0) 3400 vfs_freevnodes_dec(); 3401 } 3402 3403 void 3404 vholdnz(struct vnode *vp) 3405 { 3406 3407 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3408 #ifdef INVARIANTS 3409 int old = atomic_fetchadd_int(&vp->v_holdcnt, 1); 3410 VNASSERT(old > 0 && (old & VHOLD_ALL_FLAGS) == 0, vp, 3411 ("%s: wrong hold count %d", __func__, old)); 3412 #else 3413 atomic_add_int(&vp->v_holdcnt, 1); 3414 #endif 3415 } 3416 3417 /* 3418 * Grab a hold count unless the vnode is freed. 3419 * 3420 * Only use this routine if vfs smr is the only protection you have against 3421 * freeing the vnode. 3422 * 3423 * The code loops trying to add a hold count as long as the VHOLD_NO_SMR flag 3424 * is not set. After the flag is set the vnode becomes immutable to anyone but 3425 * the thread which managed to set the flag. 3426 * 3427 * It may be tempting to replace the loop with: 3428 * count = atomic_fetchadd_int(&vp->v_holdcnt, 1); 3429 * if (count & VHOLD_NO_SMR) { 3430 * backpedal and error out; 3431 * } 3432 * 3433 * However, while this is more performant, it hinders debugging by eliminating 3434 * the previously mentioned invariant. 3435 */ 3436 bool 3437 vhold_smr(struct vnode *vp) 3438 { 3439 int count; 3440 3441 VFS_SMR_ASSERT_ENTERED(); 3442 3443 count = atomic_load_int(&vp->v_holdcnt); 3444 for (;;) { 3445 if (count & VHOLD_NO_SMR) { 3446 VNASSERT((count & ~VHOLD_NO_SMR) == 0, vp, 3447 ("non-zero hold count with flags %d\n", count)); 3448 return (false); 3449 } 3450 VNASSERT(count >= 0, vp, ("invalid hold count %d\n", count)); 3451 if (atomic_fcmpset_int(&vp->v_holdcnt, &count, count + 1)) { 3452 if (count == 0) 3453 vfs_freevnodes_dec(); 3454 return (true); 3455 } 3456 } 3457 } 3458 3459 /* 3460 * Hold a free vnode for recycling. 3461 * 3462 * Note: vnode_init references this comment. 3463 * 3464 * Attempts to recycle only need the global vnode list lock and have no use for 3465 * SMR. 3466 * 3467 * However, vnodes get inserted into the global list before they get fully 3468 * initialized and stay there until UMA decides to free the memory. This in 3469 * particular means the target can be found before it becomes usable and after 3470 * it becomes recycled. Picking up such vnodes is guarded with v_holdcnt set to 3471 * VHOLD_NO_SMR. 3472 * 3473 * Note: the vnode may gain more references after we transition the count 0->1. 
3474 */ 3475 static bool 3476 vhold_recycle_free(struct vnode *vp) 3477 { 3478 int count; 3479 3480 mtx_assert(&vnode_list_mtx, MA_OWNED); 3481 3482 count = atomic_load_int(&vp->v_holdcnt); 3483 for (;;) { 3484 if (count & VHOLD_NO_SMR) { 3485 VNASSERT((count & ~VHOLD_NO_SMR) == 0, vp, 3486 ("non-zero hold count with flags %d\n", count)); 3487 return (false); 3488 } 3489 VNASSERT(count >= 0, vp, ("invalid hold count %d\n", count)); 3490 if (count > 0) { 3491 return (false); 3492 } 3493 if (atomic_fcmpset_int(&vp->v_holdcnt, &count, count + 1)) { 3494 vfs_freevnodes_dec(); 3495 return (true); 3496 } 3497 } 3498 } 3499 3500 static void __noinline 3501 vdbatch_process(struct vdbatch *vd) 3502 { 3503 struct vnode *vp; 3504 int i; 3505 3506 mtx_assert(&vd->lock, MA_OWNED); 3507 MPASS(curthread->td_pinned > 0); 3508 MPASS(vd->index == VDBATCH_SIZE); 3509 3510 mtx_lock(&vnode_list_mtx); 3511 critical_enter(); 3512 freevnodes += vd->freevnodes; 3513 for (i = 0; i < VDBATCH_SIZE; i++) { 3514 vp = vd->tab[i]; 3515 TAILQ_REMOVE(&vnode_list, vp, v_vnodelist); 3516 TAILQ_INSERT_TAIL(&vnode_list, vp, v_vnodelist); 3517 MPASS(vp->v_dbatchcpu != NOCPU); 3518 vp->v_dbatchcpu = NOCPU; 3519 } 3520 mtx_unlock(&vnode_list_mtx); 3521 vd->freevnodes = 0; 3522 bzero(vd->tab, sizeof(vd->tab)); 3523 vd->index = 0; 3524 critical_exit(); 3525 } 3526 3527 static void 3528 vdbatch_enqueue(struct vnode *vp) 3529 { 3530 struct vdbatch *vd; 3531 3532 ASSERT_VI_LOCKED(vp, __func__); 3533 VNASSERT(!VN_IS_DOOMED(vp), vp, 3534 ("%s: deferring requeue of a doomed vnode", __func__)); 3535 3536 if (vp->v_dbatchcpu != NOCPU) { 3537 VI_UNLOCK(vp); 3538 return; 3539 } 3540 3541 sched_pin(); 3542 vd = DPCPU_PTR(vd); 3543 mtx_lock(&vd->lock); 3544 MPASS(vd->index < VDBATCH_SIZE); 3545 MPASS(vd->tab[vd->index] == NULL); 3546 /* 3547 * A hack: we depend on being pinned so that we know what to put in 3548 * ->v_dbatchcpu. 3549 */ 3550 vp->v_dbatchcpu = curcpu; 3551 vd->tab[vd->index] = vp; 3552 vd->index++; 3553 VI_UNLOCK(vp); 3554 if (vd->index == VDBATCH_SIZE) 3555 vdbatch_process(vd); 3556 mtx_unlock(&vd->lock); 3557 sched_unpin(); 3558 } 3559 3560 /* 3561 * This routine must only be called for vnodes which are about to be 3562 * deallocated. Supporting dequeue for arbitrary vndoes would require 3563 * validating that the locked batch matches. 3564 */ 3565 static void 3566 vdbatch_dequeue(struct vnode *vp) 3567 { 3568 struct vdbatch *vd; 3569 int i; 3570 short cpu; 3571 3572 VNASSERT(vp->v_type == VBAD || vp->v_type == VNON, vp, 3573 ("%s: called for a used vnode\n", __func__)); 3574 3575 cpu = vp->v_dbatchcpu; 3576 if (cpu == NOCPU) 3577 return; 3578 3579 vd = DPCPU_ID_PTR(cpu, vd); 3580 mtx_lock(&vd->lock); 3581 for (i = 0; i < vd->index; i++) { 3582 if (vd->tab[i] != vp) 3583 continue; 3584 vp->v_dbatchcpu = NOCPU; 3585 vd->index--; 3586 vd->tab[i] = vd->tab[vd->index]; 3587 vd->tab[vd->index] = NULL; 3588 break; 3589 } 3590 mtx_unlock(&vd->lock); 3591 /* 3592 * Either we dequeued the vnode above or the target CPU beat us to it. 3593 */ 3594 MPASS(vp->v_dbatchcpu == NOCPU); 3595 } 3596 3597 /* 3598 * Drop the hold count of the vnode. If this is the last reference to 3599 * the vnode we place it on the free list unless it has been vgone'd 3600 * (marked VIRF_DOOMED) in which case we will free it. 3601 * 3602 * Because the vnode vm object keeps a hold reference on the vnode if 3603 * there is at least one resident non-cached page, the vnode cannot 3604 * leave the active list without the page cleanup done. 
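 * Placement back on the global vnode_list is batched: vdropl() hands the
 * vnode to vdbatch_enqueue() and whole batches are requeued at the tail
 * of the list at once, reducing traffic on vnode_list_mtx.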
3605 */ 3606 static void __noinline 3607 vdropl_final(struct vnode *vp) 3608 { 3609 3610 ASSERT_VI_LOCKED(vp, __func__); 3611 VNPASS(VN_IS_DOOMED(vp), vp); 3612 /* 3613 * Set the VHOLD_NO_SMR flag. 3614 * 3615 * We may be racing against vhold_smr. If they win we can just pretend 3616 * we never got this far, they will vdrop later. 3617 */ 3618 if (__predict_false(!atomic_cmpset_int(&vp->v_holdcnt, 0, VHOLD_NO_SMR))) { 3619 vfs_freevnodes_inc(); 3620 VI_UNLOCK(vp); 3621 /* 3622 * We lost the aforementioned race. Any subsequent access is 3623 * invalid as they might have managed to vdropl on their own. 3624 */ 3625 return; 3626 } 3627 /* 3628 * Don't bump freevnodes as this one is going away. 3629 */ 3630 freevnode(vp); 3631 } 3632 3633 void 3634 vdrop(struct vnode *vp) 3635 { 3636 3637 ASSERT_VI_UNLOCKED(vp, __func__); 3638 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3639 if (refcount_release_if_not_last(&vp->v_holdcnt)) 3640 return; 3641 VI_LOCK(vp); 3642 vdropl(vp); 3643 } 3644 3645 static void __always_inline 3646 vdropl_impl(struct vnode *vp, bool enqueue) 3647 { 3648 3649 ASSERT_VI_LOCKED(vp, __func__); 3650 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3651 if (!refcount_release(&vp->v_holdcnt)) { 3652 VI_UNLOCK(vp); 3653 return; 3654 } 3655 VNPASS((vp->v_iflag & VI_OWEINACT) == 0, vp); 3656 VNPASS((vp->v_iflag & VI_DEFINACT) == 0, vp); 3657 if (VN_IS_DOOMED(vp)) { 3658 vdropl_final(vp); 3659 return; 3660 } 3661 3662 vfs_freevnodes_inc(); 3663 if (vp->v_mflag & VMP_LAZYLIST) { 3664 vunlazy(vp); 3665 } 3666 3667 if (!enqueue) { 3668 VI_UNLOCK(vp); 3669 return; 3670 } 3671 3672 /* 3673 * Also unlocks the interlock. We can't assert on it as we 3674 * released our hold and by now the vnode might have been 3675 * freed. 3676 */ 3677 vdbatch_enqueue(vp); 3678 } 3679 3680 void 3681 vdropl(struct vnode *vp) 3682 { 3683 3684 vdropl_impl(vp, true); 3685 } 3686 3687 /* 3688 * vdrop a vnode when recycling 3689 * 3690 * This is a special case routine only to be used when recycling, differs from 3691 * regular vdrop by not requeieing the vnode on LRU. 3692 * 3693 * Consider a case where vtryrecycle continuously fails with all vnodes (due to 3694 * e.g., frozen writes on the filesystem), filling the batch and causing it to 3695 * be requeued. Then vnlru will end up revisiting the same vnodes. This is a 3696 * loop which can last for as long as writes are frozen. 3697 */ 3698 static void 3699 vdropl_recycle(struct vnode *vp) 3700 { 3701 3702 vdropl_impl(vp, false); 3703 } 3704 3705 static void 3706 vdrop_recycle(struct vnode *vp) 3707 { 3708 3709 VI_LOCK(vp); 3710 vdropl_recycle(vp); 3711 } 3712 3713 /* 3714 * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT 3715 * flags. DOINGINACT prevents us from recursing in calls to vinactive. 3716 */ 3717 static int 3718 vinactivef(struct vnode *vp) 3719 { 3720 struct vm_object *obj; 3721 int error; 3722 3723 ASSERT_VOP_ELOCKED(vp, "vinactive"); 3724 ASSERT_VI_LOCKED(vp, "vinactive"); 3725 VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp, 3726 ("vinactive: recursed on VI_DOINGINACT")); 3727 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3728 vp->v_iflag |= VI_DOINGINACT; 3729 vp->v_iflag &= ~VI_OWEINACT; 3730 VI_UNLOCK(vp); 3731 /* 3732 * Before moving off the active list, we must be sure that any 3733 * modified pages are converted into the vnode's dirty 3734 * buffers, since these will no longer be checked once the 3735 * vnode is on the inactive list. 3736 * 3737 * The write-out of the dirty pages is asynchronous. 
At the 3738 * point that VOP_INACTIVE() is called, there could still be 3739 * pending I/O and dirty pages in the object. 3740 */ 3741 if ((obj = vp->v_object) != NULL && (vp->v_vflag & VV_NOSYNC) == 0 && 3742 vm_object_mightbedirty(obj)) { 3743 VM_OBJECT_WLOCK(obj); 3744 vm_object_page_clean(obj, 0, 0, 0); 3745 VM_OBJECT_WUNLOCK(obj); 3746 } 3747 error = VOP_INACTIVE(vp); 3748 VI_LOCK(vp); 3749 VNASSERT(vp->v_iflag & VI_DOINGINACT, vp, 3750 ("vinactive: lost VI_DOINGINACT")); 3751 vp->v_iflag &= ~VI_DOINGINACT; 3752 return (error); 3753 } 3754 3755 int 3756 vinactive(struct vnode *vp) 3757 { 3758 3759 ASSERT_VOP_ELOCKED(vp, "vinactive"); 3760 ASSERT_VI_LOCKED(vp, "vinactive"); 3761 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3762 3763 if ((vp->v_iflag & VI_OWEINACT) == 0) 3764 return (0); 3765 if (vp->v_iflag & VI_DOINGINACT) 3766 return (0); 3767 if (vp->v_usecount > 0) { 3768 vp->v_iflag &= ~VI_OWEINACT; 3769 return (0); 3770 } 3771 return (vinactivef(vp)); 3772 } 3773 3774 /* 3775 * Remove any vnodes in the vnode table belonging to mount point mp. 3776 * 3777 * If FORCECLOSE is not specified, there should not be any active ones, 3778 * return error if any are found (nb: this is a user error, not a 3779 * system error). If FORCECLOSE is specified, detach any active vnodes 3780 * that are found. 3781 * 3782 * If WRITECLOSE is set, only flush out regular file vnodes open for 3783 * writing. 3784 * 3785 * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped. 3786 * 3787 * `rootrefs' specifies the base reference count for the root vnode 3788 * of this filesystem. The root vnode is considered busy if its 3789 * v_usecount exceeds this value. On a successful return, vflush(, td) 3790 * will call vrele() on the root vnode exactly rootrefs times. 3791 * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must 3792 * be zero. 3793 */ 3794 #ifdef DIAGNOSTIC 3795 static int busyprt = 0; /* print out busy vnodes */ 3796 SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "Print out busy vnodes"); 3797 #endif 3798 3799 int 3800 vflush(struct mount *mp, int rootrefs, int flags, struct thread *td) 3801 { 3802 struct vnode *vp, *mvp, *rootvp = NULL; 3803 struct vattr vattr; 3804 int busy = 0, error; 3805 3806 CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp, 3807 rootrefs, flags); 3808 if (rootrefs > 0) { 3809 KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0, 3810 ("vflush: bad args")); 3811 /* 3812 * Get the filesystem root vnode. We can vput() it 3813 * immediately, since with rootrefs > 0, it won't go away. 3814 */ 3815 if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) { 3816 CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d", 3817 __func__, error); 3818 return (error); 3819 } 3820 vput(rootvp); 3821 } 3822 loop: 3823 MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { 3824 vholdl(vp); 3825 error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE); 3826 if (error) { 3827 vdrop(vp); 3828 MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); 3829 goto loop; 3830 } 3831 /* 3832 * Skip over a vnodes marked VV_SYSTEM. 3833 */ 3834 if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) { 3835 VOP_UNLOCK(vp); 3836 vdrop(vp); 3837 continue; 3838 } 3839 /* 3840 * If WRITECLOSE is set, flush out unlinked but still open 3841 * files (even if open only for reading) and regular file 3842 * vnodes open for writing. 
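		 * The VOP_GETATTR() check below is what catches the unlinked
		 * case: va_nlink == 0 means the file has no names left.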
3843 */ 3844 if (flags & WRITECLOSE) { 3845 if (vp->v_object != NULL) { 3846 VM_OBJECT_WLOCK(vp->v_object); 3847 vm_object_page_clean(vp->v_object, 0, 0, 0); 3848 VM_OBJECT_WUNLOCK(vp->v_object); 3849 } 3850 do { 3851 error = VOP_FSYNC(vp, MNT_WAIT, td); 3852 } while (error == ERELOOKUP); 3853 if (error != 0) { 3854 VOP_UNLOCK(vp); 3855 vdrop(vp); 3856 MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); 3857 return (error); 3858 } 3859 error = VOP_GETATTR(vp, &vattr, td->td_ucred); 3860 VI_LOCK(vp); 3861 3862 if ((vp->v_type == VNON || 3863 (error == 0 && vattr.va_nlink > 0)) && 3864 (vp->v_writecount <= 0 || vp->v_type != VREG)) { 3865 VOP_UNLOCK(vp); 3866 vdropl(vp); 3867 continue; 3868 } 3869 } else 3870 VI_LOCK(vp); 3871 /* 3872 * With v_usecount == 0, all we need to do is clear out the 3873 * vnode data structures and we are done. 3874 * 3875 * If FORCECLOSE is set, forcibly close the vnode. 3876 */ 3877 if (vp->v_usecount == 0 || (flags & FORCECLOSE)) { 3878 vgonel(vp); 3879 } else { 3880 busy++; 3881 #ifdef DIAGNOSTIC 3882 if (busyprt) 3883 vn_printf(vp, "vflush: busy vnode "); 3884 #endif 3885 } 3886 VOP_UNLOCK(vp); 3887 vdropl(vp); 3888 } 3889 if (rootrefs > 0 && (flags & FORCECLOSE) == 0) { 3890 /* 3891 * If just the root vnode is busy, and if its refcount 3892 * is equal to `rootrefs', then go ahead and kill it. 3893 */ 3894 VI_LOCK(rootvp); 3895 KASSERT(busy > 0, ("vflush: not busy")); 3896 VNASSERT(rootvp->v_usecount >= rootrefs, rootvp, 3897 ("vflush: usecount %d < rootrefs %d", 3898 rootvp->v_usecount, rootrefs)); 3899 if (busy == 1 && rootvp->v_usecount == rootrefs) { 3900 VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK); 3901 vgone(rootvp); 3902 VOP_UNLOCK(rootvp); 3903 busy = 0; 3904 } else 3905 VI_UNLOCK(rootvp); 3906 } 3907 if (busy) { 3908 CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__, 3909 busy); 3910 return (EBUSY); 3911 } 3912 for (; rootrefs > 0; rootrefs--) 3913 vrele(rootvp); 3914 return (0); 3915 } 3916 3917 /* 3918 * Recycle an unused vnode to the front of the free list. 3919 */ 3920 int 3921 vrecycle(struct vnode *vp) 3922 { 3923 int recycled; 3924 3925 VI_LOCK(vp); 3926 recycled = vrecyclel(vp); 3927 VI_UNLOCK(vp); 3928 return (recycled); 3929 } 3930 3931 /* 3932 * vrecycle, with the vp interlock held. 3933 */ 3934 int 3935 vrecyclel(struct vnode *vp) 3936 { 3937 int recycled; 3938 3939 ASSERT_VOP_ELOCKED(vp, __func__); 3940 ASSERT_VI_LOCKED(vp, __func__); 3941 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3942 recycled = 0; 3943 if (vp->v_usecount == 0) { 3944 recycled = 1; 3945 vgonel(vp); 3946 } 3947 return (recycled); 3948 } 3949 3950 /* 3951 * Eliminate all activity associated with a vnode 3952 * in preparation for reuse. 3953 */ 3954 void 3955 vgone(struct vnode *vp) 3956 { 3957 VI_LOCK(vp); 3958 vgonel(vp); 3959 VI_UNLOCK(vp); 3960 } 3961 3962 /* 3963 * Notify upper mounts about reclaimed or unlinked vnode. 
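 * This is how stacked filesystems such as nullfs, registered on the
 * mnt_notify list, learn that a lower vnode they reference is being
 * reclaimed or was unlinked.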
3964 */ 3965 void 3966 vfs_notify_upper(struct vnode *vp, enum vfs_notify_upper_type event) 3967 { 3968 struct mount *mp; 3969 struct mount_upper_node *ump; 3970 3971 mp = atomic_load_ptr(&vp->v_mount); 3972 if (mp == NULL) 3973 return; 3974 if (TAILQ_EMPTY(&mp->mnt_notify)) 3975 return; 3976 3977 MNT_ILOCK(mp); 3978 mp->mnt_upper_pending++; 3979 KASSERT(mp->mnt_upper_pending > 0, 3980 ("%s: mnt_upper_pending %d", __func__, mp->mnt_upper_pending)); 3981 TAILQ_FOREACH(ump, &mp->mnt_notify, mnt_upper_link) { 3982 MNT_IUNLOCK(mp); 3983 switch (event) { 3984 case VFS_NOTIFY_UPPER_RECLAIM: 3985 VFS_RECLAIM_LOWERVP(ump->mp, vp); 3986 break; 3987 case VFS_NOTIFY_UPPER_UNLINK: 3988 VFS_UNLINK_LOWERVP(ump->mp, vp); 3989 break; 3990 } 3991 MNT_ILOCK(mp); 3992 } 3993 mp->mnt_upper_pending--; 3994 if ((mp->mnt_kern_flag & MNTK_UPPER_WAITER) != 0 && 3995 mp->mnt_upper_pending == 0) { 3996 mp->mnt_kern_flag &= ~MNTK_UPPER_WAITER; 3997 wakeup(&mp->mnt_uppers); 3998 } 3999 MNT_IUNLOCK(mp); 4000 } 4001 4002 /* 4003 * vgone, with the vp interlock held. 4004 */ 4005 static void 4006 vgonel(struct vnode *vp) 4007 { 4008 struct thread *td; 4009 struct mount *mp; 4010 vm_object_t object; 4011 bool active, doinginact, oweinact; 4012 4013 ASSERT_VOP_ELOCKED(vp, "vgonel"); 4014 ASSERT_VI_LOCKED(vp, "vgonel"); 4015 VNASSERT(vp->v_holdcnt, vp, 4016 ("vgonel: vp %p has no reference.", vp)); 4017 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 4018 td = curthread; 4019 4020 /* 4021 * Don't vgonel if we're already doomed. 4022 */ 4023 if (VN_IS_DOOMED(vp)) { 4024 VNPASS(vn_get_state(vp) == VSTATE_DESTROYING || \ 4025 vn_get_state(vp) == VSTATE_DEAD, vp); 4026 return; 4027 } 4028 /* 4029 * Paired with freevnode. 4030 */ 4031 vn_seqc_write_begin_locked(vp); 4032 vunlazy_gone(vp); 4033 vn_irflag_set_locked(vp, VIRF_DOOMED); 4034 vn_set_state(vp, VSTATE_DESTROYING); 4035 4036 /* 4037 * Check to see if the vnode is in use. If so, we have to 4038 * call VOP_CLOSE() and VOP_INACTIVE(). 4039 * 4040 * It could be that VOP_INACTIVE() requested reclamation, in 4041 * which case we should avoid recursion, so check 4042 * VI_DOINGINACT. This is not precise but good enough. 4043 */ 4044 active = vp->v_usecount > 0; 4045 oweinact = (vp->v_iflag & VI_OWEINACT) != 0; 4046 doinginact = (vp->v_iflag & VI_DOINGINACT) != 0; 4047 4048 /* 4049 * If we need to do inactive VI_OWEINACT will be set. 4050 */ 4051 if (vp->v_iflag & VI_DEFINACT) { 4052 VNASSERT(vp->v_holdcnt > 1, vp, ("lost hold count")); 4053 vp->v_iflag &= ~VI_DEFINACT; 4054 vdropl(vp); 4055 } else { 4056 VNASSERT(vp->v_holdcnt > 0, vp, ("vnode without hold count")); 4057 VI_UNLOCK(vp); 4058 } 4059 cache_purge_vgone(vp); 4060 vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM); 4061 4062 /* 4063 * If purging an active vnode, it must be closed and 4064 * deactivated before being reclaimed. 4065 */ 4066 if (active) 4067 VOP_CLOSE(vp, FNONBLOCK, NOCRED, td); 4068 if (!doinginact) { 4069 do { 4070 if (oweinact || active) { 4071 VI_LOCK(vp); 4072 vinactivef(vp); 4073 oweinact = (vp->v_iflag & VI_OWEINACT) != 0; 4074 VI_UNLOCK(vp); 4075 } 4076 } while (oweinact); 4077 } 4078 if (vp->v_type == VSOCK) 4079 vfs_unp_reclaim(vp); 4080 4081 /* 4082 * Clean out any buffers associated with the vnode. 4083 * If the flush fails, just toss the buffers. 
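 *
 * vn_start_secondary_write() is only entered when dirty buffers are
 * present; if the V_SAVE pass cannot flush them, the remaining
 * buffers are simply discarded by the plain vinvalbuf() loop.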
4084 */ 4085 mp = NULL; 4086 if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd)) 4087 (void) vn_start_secondary_write(vp, &mp, V_WAIT); 4088 if (vinvalbuf(vp, V_SAVE, 0, 0) != 0) { 4089 while (vinvalbuf(vp, 0, 0, 0) != 0) 4090 ; 4091 } 4092 4093 BO_LOCK(&vp->v_bufobj); 4094 KASSERT(TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd) && 4095 vp->v_bufobj.bo_dirty.bv_cnt == 0 && 4096 TAILQ_EMPTY(&vp->v_bufobj.bo_clean.bv_hd) && 4097 vp->v_bufobj.bo_clean.bv_cnt == 0, 4098 ("vp %p bufobj not invalidated", vp)); 4099 4100 /* 4101 * For VMIO bufobj, BO_DEAD is set later, or in 4102 * vm_object_terminate() after the object's page queue is 4103 * flushed. 4104 */ 4105 object = vp->v_bufobj.bo_object; 4106 if (object == NULL) 4107 vp->v_bufobj.bo_flag |= BO_DEAD; 4108 BO_UNLOCK(&vp->v_bufobj); 4109 4110 /* 4111 * Handle the VM part. Tmpfs handles v_object on its own (the 4112 * OBJT_VNODE check). Nullfs or other bypassing filesystems 4113 * should not touch the object borrowed from the lower vnode 4114 * (the handle check). 4115 */ 4116 if (object != NULL && object->type == OBJT_VNODE && 4117 object->handle == vp) 4118 vnode_destroy_vobject(vp); 4119 4120 /* 4121 * Reclaim the vnode. 4122 */ 4123 if (VOP_RECLAIM(vp)) 4124 panic("vgone: cannot reclaim"); 4125 if (mp != NULL) 4126 vn_finished_secondary_write(mp); 4127 VNASSERT(vp->v_object == NULL, vp, 4128 ("vop_reclaim left v_object vp=%p", vp)); 4129 /* 4130 * Clear the advisory locks and wake up waiting threads. 4131 */ 4132 if (vp->v_lockf != NULL) { 4133 (void)VOP_ADVLOCKPURGE(vp); 4134 vp->v_lockf = NULL; 4135 } 4136 /* 4137 * Delete from old mount point vnode list. 4138 */ 4139 if (vp->v_mount == NULL) { 4140 VI_LOCK(vp); 4141 } else { 4142 delmntque(vp); 4143 ASSERT_VI_LOCKED(vp, "vgonel 2"); 4144 } 4145 /* 4146 * Done with purge, reset to the standard lock and invalidate 4147 * the vnode. 4148 */ 4149 vp->v_vnlock = &vp->v_lock; 4150 vp->v_op = &dead_vnodeops; 4151 vp->v_type = VBAD; 4152 vn_set_state(vp, VSTATE_DEAD); 4153 } 4154 4155 /* 4156 * Print out a description of a vnode. 4157 */ 4158 static const char *const vtypename[] = { 4159 [VNON] = "VNON", 4160 [VREG] = "VREG", 4161 [VDIR] = "VDIR", 4162 [VBLK] = "VBLK", 4163 [VCHR] = "VCHR", 4164 [VLNK] = "VLNK", 4165 [VSOCK] = "VSOCK", 4166 [VFIFO] = "VFIFO", 4167 [VBAD] = "VBAD", 4168 [VMARKER] = "VMARKER", 4169 }; 4170 _Static_assert(nitems(vtypename) == VLASTTYPE + 1, 4171 "vnode type name not added to vtypename"); 4172 4173 static const char *const vstatename[] = { 4174 [VSTATE_UNINITIALIZED] = "VSTATE_UNINITIALIZED", 4175 [VSTATE_CONSTRUCTED] = "VSTATE_CONSTRUCTED", 4176 [VSTATE_DESTROYING] = "VSTATE_DESTROYING", 4177 [VSTATE_DEAD] = "VSTATE_DEAD", 4178 }; 4179 _Static_assert(nitems(vstatename) == VLASTSTATE + 1, 4180 "vnode state name not added to vstatename"); 4181 4182 _Static_assert((VHOLD_ALL_FLAGS & ~VHOLD_NO_SMR) == 0, 4183 "new hold count flag not added to vn_printf"); 4184 4185 void 4186 vn_printf(struct vnode *vp, const char *fmt, ...) 
4187 { 4188 va_list ap; 4189 char buf[256], buf2[16]; 4190 u_long flags; 4191 u_int holdcnt; 4192 short irflag; 4193 4194 va_start(ap, fmt); 4195 vprintf(fmt, ap); 4196 va_end(ap); 4197 printf("%p: ", (void *)vp); 4198 printf("type %s state %s\n", vtypename[vp->v_type], vstatename[vp->v_state]); 4199 holdcnt = atomic_load_int(&vp->v_holdcnt); 4200 printf(" usecount %d, writecount %d, refcount %d seqc users %d", 4201 vp->v_usecount, vp->v_writecount, holdcnt & ~VHOLD_ALL_FLAGS, 4202 vp->v_seqc_users); 4203 switch (vp->v_type) { 4204 case VDIR: 4205 printf(" mountedhere %p\n", vp->v_mountedhere); 4206 break; 4207 case VCHR: 4208 printf(" rdev %p\n", vp->v_rdev); 4209 break; 4210 case VSOCK: 4211 printf(" socket %p\n", vp->v_unpcb); 4212 break; 4213 case VFIFO: 4214 printf(" fifoinfo %p\n", vp->v_fifoinfo); 4215 break; 4216 default: 4217 printf("\n"); 4218 break; 4219 } 4220 buf[0] = '\0'; 4221 buf[1] = '\0'; 4222 if (holdcnt & VHOLD_NO_SMR) 4223 strlcat(buf, "|VHOLD_NO_SMR", sizeof(buf)); 4224 printf(" hold count flags (%s)\n", buf + 1); 4225 4226 buf[0] = '\0'; 4227 buf[1] = '\0'; 4228 irflag = vn_irflag_read(vp); 4229 if (irflag & VIRF_DOOMED) 4230 strlcat(buf, "|VIRF_DOOMED", sizeof(buf)); 4231 if (irflag & VIRF_PGREAD) 4232 strlcat(buf, "|VIRF_PGREAD", sizeof(buf)); 4233 if (irflag & VIRF_MOUNTPOINT) 4234 strlcat(buf, "|VIRF_MOUNTPOINT", sizeof(buf)); 4235 if (irflag & VIRF_TEXT_REF) 4236 strlcat(buf, "|VIRF_TEXT_REF", sizeof(buf)); 4237 flags = irflag & ~(VIRF_DOOMED | VIRF_PGREAD | VIRF_MOUNTPOINT | VIRF_TEXT_REF); 4238 if (flags != 0) { 4239 snprintf(buf2, sizeof(buf2), "|VIRF(0x%lx)", flags); 4240 strlcat(buf, buf2, sizeof(buf)); 4241 } 4242 if (vp->v_vflag & VV_ROOT) 4243 strlcat(buf, "|VV_ROOT", sizeof(buf)); 4244 if (vp->v_vflag & VV_ISTTY) 4245 strlcat(buf, "|VV_ISTTY", sizeof(buf)); 4246 if (vp->v_vflag & VV_NOSYNC) 4247 strlcat(buf, "|VV_NOSYNC", sizeof(buf)); 4248 if (vp->v_vflag & VV_ETERNALDEV) 4249 strlcat(buf, "|VV_ETERNALDEV", sizeof(buf)); 4250 if (vp->v_vflag & VV_CACHEDLABEL) 4251 strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf)); 4252 if (vp->v_vflag & VV_VMSIZEVNLOCK) 4253 strlcat(buf, "|VV_VMSIZEVNLOCK", sizeof(buf)); 4254 if (vp->v_vflag & VV_COPYONWRITE) 4255 strlcat(buf, "|VV_COPYONWRITE", sizeof(buf)); 4256 if (vp->v_vflag & VV_SYSTEM) 4257 strlcat(buf, "|VV_SYSTEM", sizeof(buf)); 4258 if (vp->v_vflag & VV_PROCDEP) 4259 strlcat(buf, "|VV_PROCDEP", sizeof(buf)); 4260 if (vp->v_vflag & VV_DELETED) 4261 strlcat(buf, "|VV_DELETED", sizeof(buf)); 4262 if (vp->v_vflag & VV_MD) 4263 strlcat(buf, "|VV_MD", sizeof(buf)); 4264 if (vp->v_vflag & VV_FORCEINSMQ) 4265 strlcat(buf, "|VV_FORCEINSMQ", sizeof(buf)); 4266 if (vp->v_vflag & VV_READLINK) 4267 strlcat(buf, "|VV_READLINK", sizeof(buf)); 4268 flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_ETERNALDEV | 4269 VV_CACHEDLABEL | VV_VMSIZEVNLOCK | VV_COPYONWRITE | VV_SYSTEM | 4270 VV_PROCDEP | VV_DELETED | VV_MD | VV_FORCEINSMQ | VV_READLINK); 4271 if (flags != 0) { 4272 snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags); 4273 strlcat(buf, buf2, sizeof(buf)); 4274 } 4275 if (vp->v_iflag & VI_MOUNT) 4276 strlcat(buf, "|VI_MOUNT", sizeof(buf)); 4277 if (vp->v_iflag & VI_DOINGINACT) 4278 strlcat(buf, "|VI_DOINGINACT", sizeof(buf)); 4279 if (vp->v_iflag & VI_OWEINACT) 4280 strlcat(buf, "|VI_OWEINACT", sizeof(buf)); 4281 if (vp->v_iflag & VI_DEFINACT) 4282 strlcat(buf, "|VI_DEFINACT", sizeof(buf)); 4283 if (vp->v_iflag & VI_FOPENING) 4284 strlcat(buf, "|VI_FOPENING", sizeof(buf)); 4285 flags = vp->v_iflag & ~(VI_MOUNT | 
VI_DOINGINACT | 4286 VI_OWEINACT | VI_DEFINACT | VI_FOPENING); 4287 if (flags != 0) { 4288 snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags); 4289 strlcat(buf, buf2, sizeof(buf)); 4290 } 4291 if (vp->v_mflag & VMP_LAZYLIST) 4292 strlcat(buf, "|VMP_LAZYLIST", sizeof(buf)); 4293 flags = vp->v_mflag & ~(VMP_LAZYLIST); 4294 if (flags != 0) { 4295 snprintf(buf2, sizeof(buf2), "|VMP(0x%lx)", flags); 4296 strlcat(buf, buf2, sizeof(buf)); 4297 } 4298 printf(" flags (%s)", buf + 1); 4299 if (mtx_owned(VI_MTX(vp))) 4300 printf(" VI_LOCKed"); 4301 printf("\n"); 4302 if (vp->v_object != NULL) 4303 printf(" v_object %p ref %d pages %d " 4304 "cleanbuf %d dirtybuf %d\n", 4305 vp->v_object, vp->v_object->ref_count, 4306 vp->v_object->resident_page_count, 4307 vp->v_bufobj.bo_clean.bv_cnt, 4308 vp->v_bufobj.bo_dirty.bv_cnt); 4309 printf(" "); 4310 lockmgr_printinfo(vp->v_vnlock); 4311 if (vp->v_data != NULL) 4312 VOP_PRINT(vp); 4313 } 4314 4315 #ifdef DDB 4316 /* 4317 * List all of the locked vnodes in the system. 4318 * Called when debugging the kernel. 4319 */ 4320 DB_SHOW_COMMAND_FLAGS(lockedvnods, lockedvnodes, DB_CMD_MEMSAFE) 4321 { 4322 struct mount *mp; 4323 struct vnode *vp; 4324 4325 /* 4326 * Note: because this is DDB, we can't obey the locking semantics 4327 * for these structures, which means we could catch an inconsistent 4328 * state and dereference a nasty pointer. Not much to be done 4329 * about that. 4330 */ 4331 db_printf("Locked vnodes\n"); 4332 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 4333 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 4334 if (vp->v_type != VMARKER && VOP_ISLOCKED(vp)) 4335 vn_printf(vp, "vnode "); 4336 } 4337 } 4338 } 4339 4340 /* 4341 * Show details about the given vnode. 4342 */ 4343 DB_SHOW_COMMAND(vnode, db_show_vnode) 4344 { 4345 struct vnode *vp; 4346 4347 if (!have_addr) 4348 return; 4349 vp = (struct vnode *)addr; 4350 vn_printf(vp, "vnode "); 4351 } 4352 4353 /* 4354 * Show details about the given mount point. 4355 */ 4356 DB_SHOW_COMMAND(mount, db_show_mount) 4357 { 4358 struct mount *mp; 4359 struct vfsopt *opt; 4360 struct statfs *sp; 4361 struct vnode *vp; 4362 char buf[512]; 4363 uint64_t mflags; 4364 u_int flags; 4365 4366 if (!have_addr) { 4367 /* No address given, print short info about all mount points. 
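 * The walk below honors db_pager_quit so it can be interrupted from
 * the DDB pager.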
*/ 4368 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 4369 db_printf("%p %s on %s (%s)\n", mp, 4370 mp->mnt_stat.f_mntfromname, 4371 mp->mnt_stat.f_mntonname, 4372 mp->mnt_stat.f_fstypename); 4373 if (db_pager_quit) 4374 break; 4375 } 4376 db_printf("\nMore info: show mount <addr>\n"); 4377 return; 4378 } 4379 4380 mp = (struct mount *)addr; 4381 db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname, 4382 mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename); 4383 4384 buf[0] = '\0'; 4385 mflags = mp->mnt_flag; 4386 #define MNT_FLAG(flag) do { \ 4387 if (mflags & (flag)) { \ 4388 if (buf[0] != '\0') \ 4389 strlcat(buf, ", ", sizeof(buf)); \ 4390 strlcat(buf, (#flag) + 4, sizeof(buf)); \ 4391 mflags &= ~(flag); \ 4392 } \ 4393 } while (0) 4394 MNT_FLAG(MNT_RDONLY); 4395 MNT_FLAG(MNT_SYNCHRONOUS); 4396 MNT_FLAG(MNT_NOEXEC); 4397 MNT_FLAG(MNT_NOSUID); 4398 MNT_FLAG(MNT_NFS4ACLS); 4399 MNT_FLAG(MNT_UNION); 4400 MNT_FLAG(MNT_ASYNC); 4401 MNT_FLAG(MNT_SUIDDIR); 4402 MNT_FLAG(MNT_SOFTDEP); 4403 MNT_FLAG(MNT_NOSYMFOLLOW); 4404 MNT_FLAG(MNT_GJOURNAL); 4405 MNT_FLAG(MNT_MULTILABEL); 4406 MNT_FLAG(MNT_ACLS); 4407 MNT_FLAG(MNT_NOATIME); 4408 MNT_FLAG(MNT_NOCLUSTERR); 4409 MNT_FLAG(MNT_NOCLUSTERW); 4410 MNT_FLAG(MNT_SUJ); 4411 MNT_FLAG(MNT_EXRDONLY); 4412 MNT_FLAG(MNT_EXPORTED); 4413 MNT_FLAG(MNT_DEFEXPORTED); 4414 MNT_FLAG(MNT_EXPORTANON); 4415 MNT_FLAG(MNT_EXKERB); 4416 MNT_FLAG(MNT_EXPUBLIC); 4417 MNT_FLAG(MNT_LOCAL); 4418 MNT_FLAG(MNT_QUOTA); 4419 MNT_FLAG(MNT_ROOTFS); 4420 MNT_FLAG(MNT_USER); 4421 MNT_FLAG(MNT_IGNORE); 4422 MNT_FLAG(MNT_UPDATE); 4423 MNT_FLAG(MNT_DELEXPORT); 4424 MNT_FLAG(MNT_RELOAD); 4425 MNT_FLAG(MNT_FORCE); 4426 MNT_FLAG(MNT_SNAPSHOT); 4427 MNT_FLAG(MNT_BYFSID); 4428 #undef MNT_FLAG 4429 if (mflags != 0) { 4430 if (buf[0] != '\0') 4431 strlcat(buf, ", ", sizeof(buf)); 4432 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), 4433 "0x%016jx", mflags); 4434 } 4435 db_printf(" mnt_flag = %s\n", buf); 4436 4437 buf[0] = '\0'; 4438 flags = mp->mnt_kern_flag; 4439 #define MNT_KERN_FLAG(flag) do { \ 4440 if (flags & (flag)) { \ 4441 if (buf[0] != '\0') \ 4442 strlcat(buf, ", ", sizeof(buf)); \ 4443 strlcat(buf, (#flag) + 5, sizeof(buf)); \ 4444 flags &= ~(flag); \ 4445 } \ 4446 } while (0) 4447 MNT_KERN_FLAG(MNTK_UNMOUNTF); 4448 MNT_KERN_FLAG(MNTK_ASYNC); 4449 MNT_KERN_FLAG(MNTK_SOFTDEP); 4450 MNT_KERN_FLAG(MNTK_NOMSYNC); 4451 MNT_KERN_FLAG(MNTK_DRAINING); 4452 MNT_KERN_FLAG(MNTK_REFEXPIRE); 4453 MNT_KERN_FLAG(MNTK_EXTENDED_SHARED); 4454 MNT_KERN_FLAG(MNTK_SHARED_WRITES); 4455 MNT_KERN_FLAG(MNTK_NO_IOPF); 4456 MNT_KERN_FLAG(MNTK_RECURSE); 4457 MNT_KERN_FLAG(MNTK_UPPER_WAITER); 4458 MNT_KERN_FLAG(MNTK_UNLOCKED_INSMNTQUE); 4459 MNT_KERN_FLAG(MNTK_USES_BCACHE); 4460 MNT_KERN_FLAG(MNTK_VMSETSIZE_BUG); 4461 MNT_KERN_FLAG(MNTK_FPLOOKUP); 4462 MNT_KERN_FLAG(MNTK_TASKQUEUE_WAITER); 4463 MNT_KERN_FLAG(MNTK_NOASYNC); 4464 MNT_KERN_FLAG(MNTK_UNMOUNT); 4465 MNT_KERN_FLAG(MNTK_MWAIT); 4466 MNT_KERN_FLAG(MNTK_SUSPEND); 4467 MNT_KERN_FLAG(MNTK_SUSPEND2); 4468 MNT_KERN_FLAG(MNTK_SUSPENDED); 4469 MNT_KERN_FLAG(MNTK_NULL_NOCACHE); 4470 MNT_KERN_FLAG(MNTK_LOOKUP_SHARED); 4471 #undef MNT_KERN_FLAG 4472 if (flags != 0) { 4473 if (buf[0] != '\0') 4474 strlcat(buf, ", ", sizeof(buf)); 4475 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), 4476 "0x%08x", flags); 4477 } 4478 db_printf(" mnt_kern_flag = %s\n", buf); 4479 4480 db_printf(" mnt_opt = "); 4481 opt = TAILQ_FIRST(mp->mnt_opt); 4482 if (opt != NULL) { 4483 db_printf("%s", opt->name); 4484 opt = TAILQ_NEXT(opt, link); 4485 while (opt != 
NULL) { 4486 db_printf(", %s", opt->name); 4487 opt = TAILQ_NEXT(opt, link); 4488 } 4489 } 4490 db_printf("\n"); 4491 4492 sp = &mp->mnt_stat; 4493 db_printf(" mnt_stat = { version=%u type=%u flags=0x%016jx " 4494 "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju " 4495 "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju " 4496 "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n", 4497 (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags, 4498 (uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize, 4499 (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree, 4500 (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files, 4501 (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites, 4502 (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads, 4503 (uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax, 4504 (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]); 4505 4506 db_printf(" mnt_cred = { uid=%u ruid=%u", 4507 (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid); 4508 if (jailed(mp->mnt_cred)) 4509 db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id); 4510 db_printf(" }\n"); 4511 db_printf(" mnt_ref = %d (with %d in the struct)\n", 4512 vfs_mount_fetch_counter(mp, MNT_COUNT_REF), mp->mnt_ref); 4513 db_printf(" mnt_gen = %d\n", mp->mnt_gen); 4514 db_printf(" mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize); 4515 db_printf(" mnt_lazyvnodelistsize = %d\n", 4516 mp->mnt_lazyvnodelistsize); 4517 db_printf(" mnt_writeopcount = %d (with %d in the struct)\n", 4518 vfs_mount_fetch_counter(mp, MNT_COUNT_WRITEOPCOUNT), mp->mnt_writeopcount); 4519 db_printf(" mnt_iosize_max = %d\n", mp->mnt_iosize_max); 4520 db_printf(" mnt_hashseed = %u\n", mp->mnt_hashseed); 4521 db_printf(" mnt_lockref = %d (with %d in the struct)\n", 4522 vfs_mount_fetch_counter(mp, MNT_COUNT_LOCKREF), mp->mnt_lockref); 4523 db_printf(" mnt_secondary_writes = %d\n", mp->mnt_secondary_writes); 4524 db_printf(" mnt_secondary_accwrites = %d\n", 4525 mp->mnt_secondary_accwrites); 4526 db_printf(" mnt_gjprovider = %s\n", 4527 mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL"); 4528 db_printf(" mnt_vfs_ops = %d\n", mp->mnt_vfs_ops); 4529 4530 db_printf("\n\nList of active vnodes\n"); 4531 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 4532 if (vp->v_type != VMARKER && vp->v_holdcnt > 0) { 4533 vn_printf(vp, "vnode "); 4534 if (db_pager_quit) 4535 break; 4536 } 4537 } 4538 db_printf("\n\nList of inactive vnodes\n"); 4539 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 4540 if (vp->v_type != VMARKER && vp->v_holdcnt == 0) { 4541 vn_printf(vp, "vnode "); 4542 if (db_pager_quit) 4543 break; 4544 } 4545 } 4546 } 4547 #endif /* DDB */ 4548 4549 /* 4550 * Fill in a struct xvfsconf based on a struct vfsconf. 4551 */ 4552 static int 4553 vfsconf2x(struct sysctl_req *req, struct vfsconf *vfsp) 4554 { 4555 struct xvfsconf xvfsp; 4556 4557 bzero(&xvfsp, sizeof(xvfsp)); 4558 strcpy(xvfsp.vfc_name, vfsp->vfc_name); 4559 xvfsp.vfc_typenum = vfsp->vfc_typenum; 4560 xvfsp.vfc_refcount = vfsp->vfc_refcount; 4561 xvfsp.vfc_flags = vfsp->vfc_flags; 4562 /* 4563 * These are unused in userland, we keep them 4564 * to not break binary compatibility. 
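 * Reporting them as NULL also avoids exposing kernel pointers to
 * userland.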
4565 */ 4566 xvfsp.vfc_vfsops = NULL; 4567 xvfsp.vfc_next = NULL; 4568 return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); 4569 } 4570 4571 #ifdef COMPAT_FREEBSD32 4572 struct xvfsconf32 { 4573 uint32_t vfc_vfsops; 4574 char vfc_name[MFSNAMELEN]; 4575 int32_t vfc_typenum; 4576 int32_t vfc_refcount; 4577 int32_t vfc_flags; 4578 uint32_t vfc_next; 4579 }; 4580 4581 static int 4582 vfsconf2x32(struct sysctl_req *req, struct vfsconf *vfsp) 4583 { 4584 struct xvfsconf32 xvfsp; 4585 4586 bzero(&xvfsp, sizeof(xvfsp)); 4587 strcpy(xvfsp.vfc_name, vfsp->vfc_name); 4588 xvfsp.vfc_typenum = vfsp->vfc_typenum; 4589 xvfsp.vfc_refcount = vfsp->vfc_refcount; 4590 xvfsp.vfc_flags = vfsp->vfc_flags; 4591 return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); 4592 } 4593 #endif 4594 4595 /* 4596 * Top level filesystem related information gathering. 4597 */ 4598 static int 4599 sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS) 4600 { 4601 struct vfsconf *vfsp; 4602 int error; 4603 4604 error = 0; 4605 vfsconf_slock(); 4606 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { 4607 #ifdef COMPAT_FREEBSD32 4608 if (req->flags & SCTL_MASK32) 4609 error = vfsconf2x32(req, vfsp); 4610 else 4611 #endif 4612 error = vfsconf2x(req, vfsp); 4613 if (error) 4614 break; 4615 } 4616 vfsconf_sunlock(); 4617 return (error); 4618 } 4619 4620 SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD | 4621 CTLFLAG_MPSAFE, NULL, 0, sysctl_vfs_conflist, 4622 "S,xvfsconf", "List of all configured filesystems"); 4623 4624 #ifndef BURN_BRIDGES 4625 static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS); 4626 4627 static int 4628 vfs_sysctl(SYSCTL_HANDLER_ARGS) 4629 { 4630 int *name = (int *)arg1 - 1; /* XXX */ 4631 u_int namelen = arg2 + 1; /* XXX */ 4632 struct vfsconf *vfsp; 4633 4634 log(LOG_WARNING, "userland calling deprecated sysctl, " 4635 "please rebuild world\n"); 4636 4637 #if 1 || defined(COMPAT_PRELITE2) 4638 /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. 
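 * A request with a single name component is treated as the old
 * VFS_VFSCONF form and is handed off to sysctl_ovfs_conf().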
*/ 4639 if (namelen == 1) 4640 return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); 4641 #endif 4642 4643 switch (name[1]) { 4644 case VFS_MAXTYPENUM: 4645 if (namelen != 2) 4646 return (ENOTDIR); 4647 return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); 4648 case VFS_CONF: 4649 if (namelen != 3) 4650 return (ENOTDIR); /* overloaded */ 4651 vfsconf_slock(); 4652 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { 4653 if (vfsp->vfc_typenum == name[2]) 4654 break; 4655 } 4656 vfsconf_sunlock(); 4657 if (vfsp == NULL) 4658 return (EOPNOTSUPP); 4659 #ifdef COMPAT_FREEBSD32 4660 if (req->flags & SCTL_MASK32) 4661 return (vfsconf2x32(req, vfsp)); 4662 else 4663 #endif 4664 return (vfsconf2x(req, vfsp)); 4665 } 4666 return (EOPNOTSUPP); 4667 } 4668 4669 static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP | 4670 CTLFLAG_MPSAFE, vfs_sysctl, 4671 "Generic filesystem"); 4672 4673 #if 1 || defined(COMPAT_PRELITE2) 4674 4675 static int 4676 sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS) 4677 { 4678 int error; 4679 struct vfsconf *vfsp; 4680 struct ovfsconf ovfs; 4681 4682 vfsconf_slock(); 4683 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { 4684 bzero(&ovfs, sizeof(ovfs)); 4685 ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ 4686 strcpy(ovfs.vfc_name, vfsp->vfc_name); 4687 ovfs.vfc_index = vfsp->vfc_typenum; 4688 ovfs.vfc_refcount = vfsp->vfc_refcount; 4689 ovfs.vfc_flags = vfsp->vfc_flags; 4690 error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); 4691 if (error != 0) { 4692 vfsconf_sunlock(); 4693 return (error); 4694 } 4695 } 4696 vfsconf_sunlock(); 4697 return (0); 4698 } 4699 4700 #endif /* 1 || COMPAT_PRELITE2 */ 4701 #endif /* !BURN_BRIDGES */ 4702 4703 #define KINFO_VNODESLOP 10 4704 #ifdef notyet 4705 /* 4706 * Dump vnode list (via sysctl). 4707 */ 4708 /* ARGSUSED */ 4709 static int 4710 sysctl_vnode(SYSCTL_HANDLER_ARGS) 4711 { 4712 struct xvnode *xvn; 4713 struct mount *mp; 4714 struct vnode *vp; 4715 int error, len, n; 4716 4717 /* 4718 * Stale numvnodes access is not fatal here. 4719 */ 4720 req->lock = 0; 4721 len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn; 4722 if (!req->oldptr) 4723 /* Make an estimate */ 4724 return (SYSCTL_OUT(req, 0, len)); 4725 4726 error = sysctl_wire_old_buffer(req, 0); 4727 if (error != 0) 4728 return (error); 4729 xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK); 4730 n = 0; 4731 mtx_lock(&mountlist_mtx); 4732 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 4733 if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) 4734 continue; 4735 MNT_ILOCK(mp); 4736 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 4737 if (n == len) 4738 break; 4739 vref(vp); 4740 xvn[n].xv_size = sizeof *xvn; 4741 xvn[n].xv_vnode = vp; 4742 xvn[n].xv_id = 0; /* XXX compat */ 4743 #define XV_COPY(field) xvn[n].xv_##field = vp->v_##field 4744 XV_COPY(usecount); 4745 XV_COPY(writecount); 4746 XV_COPY(holdcnt); 4747 XV_COPY(mount); 4748 XV_COPY(numoutput); 4749 XV_COPY(type); 4750 #undef XV_COPY 4751 xvn[n].xv_flag = vp->v_vflag; 4752 4753 switch (vp->v_type) { 4754 case VREG: 4755 case VDIR: 4756 case VLNK: 4757 break; 4758 case VBLK: 4759 case VCHR: 4760 if (vp->v_rdev == NULL) { 4761 vrele(vp); 4762 continue; 4763 } 4764 xvn[n].xv_dev = dev2udev(vp->v_rdev); 4765 break; 4766 case VSOCK: 4767 xvn[n].xv_socket = vp->v_socket; 4768 break; 4769 case VFIFO: 4770 xvn[n].xv_fifo = vp->v_fifoinfo; 4771 break; 4772 case VNON: 4773 case VBAD: 4774 default: 4775 /* shouldn't happen? 
*/ 4776 vrele(vp); 4777 continue; 4778 } 4779 vrele(vp); 4780 ++n; 4781 } 4782 MNT_IUNLOCK(mp); 4783 mtx_lock(&mountlist_mtx); 4784 vfs_unbusy(mp); 4785 if (n == len) 4786 break; 4787 } 4788 mtx_unlock(&mountlist_mtx); 4789 4790 error = SYSCTL_OUT(req, xvn, n * sizeof *xvn); 4791 free(xvn, M_TEMP); 4792 return (error); 4793 } 4794 4795 SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE | CTLFLAG_RD | 4796 CTLFLAG_MPSAFE, 0, 0, sysctl_vnode, "S,xvnode", 4797 ""); 4798 #endif 4799 4800 static void 4801 unmount_or_warn(struct mount *mp) 4802 { 4803 int error; 4804 4805 error = dounmount(mp, MNT_FORCE, curthread); 4806 if (error != 0) { 4807 printf("unmount of %s failed (", mp->mnt_stat.f_mntonname); 4808 if (error == EBUSY) 4809 printf("BUSY)\n"); 4810 else 4811 printf("%d)\n", error); 4812 } 4813 } 4814 4815 /* 4816 * Unmount all filesystems. The list is traversed in reverse order 4817 * of mounting to avoid dependencies. 4818 */ 4819 void 4820 vfs_unmountall(void) 4821 { 4822 struct mount *mp, *tmp; 4823 4824 CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__); 4825 4826 /* 4827 * Since this only runs when rebooting, it is not interlocked. 4828 */ 4829 TAILQ_FOREACH_REVERSE_SAFE(mp, &mountlist, mntlist, mnt_list, tmp) { 4830 vfs_ref(mp); 4831 4832 /* 4833 * Forcibly unmounting "/dev" before "/" would prevent clean 4834 * unmount of the latter. 4835 */ 4836 if (mp == rootdevmp) 4837 continue; 4838 4839 unmount_or_warn(mp); 4840 } 4841 4842 if (rootdevmp != NULL) 4843 unmount_or_warn(rootdevmp); 4844 } 4845 4846 static void 4847 vfs_deferred_inactive(struct vnode *vp, int lkflags) 4848 { 4849 4850 ASSERT_VI_LOCKED(vp, __func__); 4851 VNASSERT((vp->v_iflag & VI_DEFINACT) == 0, vp, ("VI_DEFINACT still set")); 4852 if ((vp->v_iflag & VI_OWEINACT) == 0) { 4853 vdropl(vp); 4854 return; 4855 } 4856 if (vn_lock(vp, lkflags) == 0) { 4857 VI_LOCK(vp); 4858 vinactive(vp); 4859 VOP_UNLOCK(vp); 4860 vdropl(vp); 4861 return; 4862 } 4863 vdefer_inactive_unlocked(vp); 4864 } 4865 4866 static int 4867 vfs_periodic_inactive_filter(struct vnode *vp, void *arg) 4868 { 4869 4870 return (vp->v_iflag & VI_DEFINACT); 4871 } 4872 4873 static void __noinline 4874 vfs_periodic_inactive(struct mount *mp, int flags) 4875 { 4876 struct vnode *vp, *mvp; 4877 int lkflags; 4878 4879 lkflags = LK_EXCLUSIVE | LK_INTERLOCK; 4880 if (flags != MNT_WAIT) 4881 lkflags |= LK_NOWAIT; 4882 4883 MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, vfs_periodic_inactive_filter, NULL) { 4884 if ((vp->v_iflag & VI_DEFINACT) == 0) { 4885 VI_UNLOCK(vp); 4886 continue; 4887 } 4888 vp->v_iflag &= ~VI_DEFINACT; 4889 vfs_deferred_inactive(vp, lkflags); 4890 } 4891 } 4892 4893 static inline bool 4894 vfs_want_msync(struct vnode *vp) 4895 { 4896 struct vm_object *obj; 4897 4898 /* 4899 * This test may be performed without any locks held. 4900 * We rely on vm_object's type stability. 
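 * VV_NOSYNC vnodes never qualify; anything else is a candidate only
 * when its VM object might hold dirty pages.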
4901 */ 4902 if (vp->v_vflag & VV_NOSYNC) 4903 return (false); 4904 obj = vp->v_object; 4905 return (obj != NULL && vm_object_mightbedirty(obj)); 4906 } 4907 4908 static int 4909 vfs_periodic_msync_inactive_filter(struct vnode *vp, void *arg __unused) 4910 { 4911 4912 if (vp->v_vflag & VV_NOSYNC) 4913 return (false); 4914 if (vp->v_iflag & VI_DEFINACT) 4915 return (true); 4916 return (vfs_want_msync(vp)); 4917 } 4918 4919 static void __noinline 4920 vfs_periodic_msync_inactive(struct mount *mp, int flags) 4921 { 4922 struct vnode *vp, *mvp; 4923 struct vm_object *obj; 4924 int lkflags, objflags; 4925 bool seen_defer; 4926 4927 lkflags = LK_EXCLUSIVE | LK_INTERLOCK; 4928 if (flags != MNT_WAIT) { 4929 lkflags |= LK_NOWAIT; 4930 objflags = OBJPC_NOSYNC; 4931 } else { 4932 objflags = OBJPC_SYNC; 4933 } 4934 4935 MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, vfs_periodic_msync_inactive_filter, NULL) { 4936 seen_defer = false; 4937 if (vp->v_iflag & VI_DEFINACT) { 4938 vp->v_iflag &= ~VI_DEFINACT; 4939 seen_defer = true; 4940 } 4941 if (!vfs_want_msync(vp)) { 4942 if (seen_defer) 4943 vfs_deferred_inactive(vp, lkflags); 4944 else 4945 VI_UNLOCK(vp); 4946 continue; 4947 } 4948 if (vget(vp, lkflags) == 0) { 4949 obj = vp->v_object; 4950 if (obj != NULL && (vp->v_vflag & VV_NOSYNC) == 0) { 4951 VM_OBJECT_WLOCK(obj); 4952 vm_object_page_clean(obj, 0, 0, objflags); 4953 VM_OBJECT_WUNLOCK(obj); 4954 } 4955 vput(vp); 4956 if (seen_defer) 4957 vdrop(vp); 4958 } else { 4959 if (seen_defer) 4960 vdefer_inactive_unlocked(vp); 4961 } 4962 } 4963 } 4964 4965 void 4966 vfs_periodic(struct mount *mp, int flags) 4967 { 4968 4969 CTR2(KTR_VFS, "%s: mp %p", __func__, mp); 4970 4971 if ((mp->mnt_kern_flag & MNTK_NOMSYNC) != 0) 4972 vfs_periodic_inactive(mp, flags); 4973 else 4974 vfs_periodic_msync_inactive(mp, flags); 4975 } 4976 4977 static void 4978 destroy_vpollinfo_free(struct vpollinfo *vi) 4979 { 4980 4981 knlist_destroy(&vi->vpi_selinfo.si_note); 4982 mtx_destroy(&vi->vpi_lock); 4983 free(vi, M_VNODEPOLL); 4984 } 4985 4986 static void 4987 destroy_vpollinfo(struct vpollinfo *vi) 4988 { 4989 4990 knlist_clear(&vi->vpi_selinfo.si_note, 1); 4991 seldrain(&vi->vpi_selinfo); 4992 destroy_vpollinfo_free(vi); 4993 } 4994 4995 /* 4996 * Initialize per-vnode helper structure to hold poll-related state. 4997 */ 4998 void 4999 v_addpollinfo(struct vnode *vp) 5000 { 5001 struct vpollinfo *vi; 5002 5003 if (vp->v_pollinfo != NULL) 5004 return; 5005 vi = malloc(sizeof(*vi), M_VNODEPOLL, M_WAITOK | M_ZERO); 5006 mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF); 5007 knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock, 5008 vfs_knlunlock, vfs_knl_assert_lock); 5009 VI_LOCK(vp); 5010 if (vp->v_pollinfo != NULL) { 5011 VI_UNLOCK(vp); 5012 destroy_vpollinfo_free(vi); 5013 return; 5014 } 5015 vp->v_pollinfo = vi; 5016 VI_UNLOCK(vp); 5017 } 5018 5019 /* 5020 * Record a process's interest in events which might happen to 5021 * a vnode. Because poll uses the historic select-style interface 5022 * internally, this routine serves as both the ``check for any 5023 * pending events'' and the ``record my interest in future events'' 5024 * functions. (These are done together, while the lock is held, 5025 * to avoid race conditions.) 
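 *
 * The return value is the subset of the requested events that were
 * already pending, which are consumed here, or 0 once the interest
 * has been recorded with selrecord().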
5026 */
5027 int
5028 vn_pollrecord(struct vnode *vp, struct thread *td, int events)
5029 {
5030
5031 v_addpollinfo(vp);
5032 mtx_lock(&vp->v_pollinfo->vpi_lock);
5033 if (vp->v_pollinfo->vpi_revents & events) {
5034 /*
5035 * This leaves events we are not interested
5036 * in available for the other process which
5037 * presumably had requested them
5038 * (otherwise they would never have been
5039 * recorded).
5040 */
5041 events &= vp->v_pollinfo->vpi_revents;
5042 vp->v_pollinfo->vpi_revents &= ~events;
5043
5044 mtx_unlock(&vp->v_pollinfo->vpi_lock);
5045 return (events);
5046 }
5047 vp->v_pollinfo->vpi_events |= events;
5048 selrecord(td, &vp->v_pollinfo->vpi_selinfo);
5049 mtx_unlock(&vp->v_pollinfo->vpi_lock);
5050 return (0);
5051 }
5052
5053 /*
5054 * Routine to create and manage a filesystem syncer vnode.
5055 */
5056 #define sync_close ((int (*)(struct vop_close_args *))nullop)
5057 static int sync_fsync(struct vop_fsync_args *);
5058 static int sync_inactive(struct vop_inactive_args *);
5059 static int sync_reclaim(struct vop_reclaim_args *);
5060
5061 static struct vop_vector sync_vnodeops = {
5062 .vop_bypass = VOP_EOPNOTSUPP,
5063 .vop_close = sync_close, /* close */
5064 .vop_fsync = sync_fsync, /* fsync */
5065 .vop_inactive = sync_inactive, /* inactive */
5066 .vop_need_inactive = vop_stdneed_inactive, /* need_inactive */
5067 .vop_reclaim = sync_reclaim, /* reclaim */
5068 .vop_lock1 = vop_stdlock, /* lock */
5069 .vop_unlock = vop_stdunlock, /* unlock */
5070 .vop_islocked = vop_stdislocked, /* islocked */
5071 };
5072 VFS_VOP_VECTOR_REGISTER(sync_vnodeops);
5073
5074 /*
5075 * Create a new filesystem syncer vnode for the specified mount point.
5076 */
5077 void
5078 vfs_allocate_syncvnode(struct mount *mp)
5079 {
5080 struct vnode *vp;
5081 struct bufobj *bo;
5082 static long start, incr, next;
5083 int error;
5084
5085 /* Allocate a new vnode */
5086 error = getnewvnode("syncer", mp, &sync_vnodeops, &vp);
5087 if (error != 0)
5088 panic("vfs_allocate_syncvnode: getnewvnode() failed");
5089 vp->v_type = VNON;
5090 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
5091 vp->v_vflag |= VV_FORCEINSMQ;
5092 error = insmntque1(vp, mp);
5093 if (error != 0)
5094 panic("vfs_allocate_syncvnode: insmntque() failed");
5095 vp->v_vflag &= ~VV_FORCEINSMQ;
5096 vn_set_state(vp, VSTATE_CONSTRUCTED);
5097 VOP_UNLOCK(vp);
5098 /*
5099 * Place the vnode onto the syncer worklist. We attempt to
5100 * scatter them about on the list so that they will go off
5101 * at evenly distributed times even if all the filesystems
5102 * are mounted at once.
5103 */
5104 next += incr;
5105 if (next == 0 || next > syncer_maxdelay) {
5106 start /= 2;
5107 incr /= 2;
5108 if (start == 0) {
5109 start = syncer_maxdelay / 2;
5110 incr = syncer_maxdelay;
5111 }
5112 next = start;
5113 }
5114 bo = &vp->v_bufobj;
5115 BO_LOCK(bo);
5116 vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0);
5117 /* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx.
*/
5118 mtx_lock(&sync_mtx);
5119 sync_vnode_count++;
5120 if (mp->mnt_syncer == NULL) {
5121 mp->mnt_syncer = vp;
5122 vp = NULL;
5123 }
5124 mtx_unlock(&sync_mtx);
5125 BO_UNLOCK(bo);
5126 if (vp != NULL) {
5127 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
5128 vgone(vp);
5129 vput(vp);
5130 }
5131 }
5132
5133 void
5134 vfs_deallocate_syncvnode(struct mount *mp)
5135 {
5136 struct vnode *vp;
5137
5138 mtx_lock(&sync_mtx);
5139 vp = mp->mnt_syncer;
5140 if (vp != NULL)
5141 mp->mnt_syncer = NULL;
5142 mtx_unlock(&sync_mtx);
5143 if (vp != NULL)
5144 vrele(vp);
5145 }
5146
5147 /*
5148 * Do a lazy sync of the filesystem.
5149 */
5150 static int
5151 sync_fsync(struct vop_fsync_args *ap)
5152 {
5153 struct vnode *syncvp = ap->a_vp;
5154 struct mount *mp = syncvp->v_mount;
5155 int error, save;
5156 struct bufobj *bo;
5157
5158 /*
5159 * We only need to do something if this is a lazy evaluation.
5160 */
5161 if (ap->a_waitfor != MNT_LAZY)
5162 return (0);
5163
5164 /*
5165 * Move ourselves to the back of the sync list.
5166 */
5167 bo = &syncvp->v_bufobj;
5168 BO_LOCK(bo);
5169 vn_syncer_add_to_worklist(bo, syncdelay);
5170 BO_UNLOCK(bo);
5171
5172 /*
5173 * Walk the list of vnodes pushing all that are dirty and
5174 * not already on the sync list.
5175 */
5176 if (vfs_busy(mp, MBF_NOWAIT) != 0)
5177 return (0);
5178 VOP_UNLOCK(syncvp);
5179 save = curthread_pflags_set(TDP_SYNCIO);
5180 /*
5181 * The filesystem at hand may be idle with free vnodes stored in the
5182 * batch. Return them instead of letting them stay there indefinitely.
5183 */
5184 vfs_periodic(mp, MNT_NOWAIT);
5185 error = VFS_SYNC(mp, MNT_LAZY);
5186 curthread_pflags_restore(save);
5187 vn_lock(syncvp, LK_EXCLUSIVE | LK_RETRY);
5188 vfs_unbusy(mp);
5189 return (error);
5190 }
5191
5192 /*
5193 * The syncer vnode is no longer referenced.
5194 */
5195 static int
5196 sync_inactive(struct vop_inactive_args *ap)
5197 {
5198
5199 vgone(ap->a_vp);
5200 return (0);
5201 }
5202
5203 /*
5204 * The syncer vnode is no longer needed and is being decommissioned.
5205 *
5206 * Modifications to the worklist must be protected by sync_mtx.
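 * Under that lock the vnode is detached from mnt_syncer and, if still
 * on the syncer worklist, removed from it as well.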
5207 */ 5208 static int 5209 sync_reclaim(struct vop_reclaim_args *ap) 5210 { 5211 struct vnode *vp = ap->a_vp; 5212 struct bufobj *bo; 5213 5214 bo = &vp->v_bufobj; 5215 BO_LOCK(bo); 5216 mtx_lock(&sync_mtx); 5217 if (vp->v_mount->mnt_syncer == vp) 5218 vp->v_mount->mnt_syncer = NULL; 5219 if (bo->bo_flag & BO_ONWORKLST) { 5220 LIST_REMOVE(bo, bo_synclist); 5221 syncer_worklist_len--; 5222 sync_vnode_count--; 5223 bo->bo_flag &= ~BO_ONWORKLST; 5224 } 5225 mtx_unlock(&sync_mtx); 5226 BO_UNLOCK(bo); 5227 5228 return (0); 5229 } 5230 5231 int 5232 vn_need_pageq_flush(struct vnode *vp) 5233 { 5234 struct vm_object *obj; 5235 5236 obj = vp->v_object; 5237 return (obj != NULL && (vp->v_vflag & VV_NOSYNC) == 0 && 5238 vm_object_mightbedirty(obj)); 5239 } 5240 5241 /* 5242 * Check if vnode represents a disk device 5243 */ 5244 bool 5245 vn_isdisk_error(struct vnode *vp, int *errp) 5246 { 5247 int error; 5248 5249 if (vp->v_type != VCHR) { 5250 error = ENOTBLK; 5251 goto out; 5252 } 5253 error = 0; 5254 dev_lock(); 5255 if (vp->v_rdev == NULL) 5256 error = ENXIO; 5257 else if (vp->v_rdev->si_devsw == NULL) 5258 error = ENXIO; 5259 else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK)) 5260 error = ENOTBLK; 5261 dev_unlock(); 5262 out: 5263 *errp = error; 5264 return (error == 0); 5265 } 5266 5267 bool 5268 vn_isdisk(struct vnode *vp) 5269 { 5270 int error; 5271 5272 return (vn_isdisk_error(vp, &error)); 5273 } 5274 5275 /* 5276 * VOP_FPLOOKUP_VEXEC routines are subject to special circumstances, see 5277 * the comment above cache_fplookup for details. 5278 */ 5279 int 5280 vaccess_vexec_smr(mode_t file_mode, uid_t file_uid, gid_t file_gid, struct ucred *cred) 5281 { 5282 int error; 5283 5284 VFS_SMR_ASSERT_ENTERED(); 5285 5286 /* Check the owner. */ 5287 if (cred->cr_uid == file_uid) { 5288 if (file_mode & S_IXUSR) 5289 return (0); 5290 goto out_error; 5291 } 5292 5293 /* Otherwise, check the groups (first match) */ 5294 if (groupmember(file_gid, cred)) { 5295 if (file_mode & S_IXGRP) 5296 return (0); 5297 goto out_error; 5298 } 5299 5300 /* Otherwise, check everyone else. */ 5301 if (file_mode & S_IXOTH) 5302 return (0); 5303 out_error: 5304 /* 5305 * Permission check failed, but it is possible denial will get overwritten 5306 * (e.g., when root is traversing through a 700 directory owned by someone 5307 * else). 5308 * 5309 * vaccess() calls priv_check_cred which in turn can descent into MAC 5310 * modules overriding this result. It's quite unclear what semantics 5311 * are allowed for them to operate, thus for safety we don't call them 5312 * from within the SMR section. This also means if any such modules 5313 * are present, we have to let the regular lookup decide. 5314 */ 5315 error = priv_check_cred_vfs_lookup_nomac(cred); 5316 switch (error) { 5317 case 0: 5318 return (0); 5319 case EAGAIN: 5320 /* 5321 * MAC modules present. 5322 */ 5323 return (EAGAIN); 5324 case EPERM: 5325 return (EACCES); 5326 default: 5327 return (error); 5328 } 5329 } 5330 5331 /* 5332 * Common filesystem object access control check routine. Accepts a 5333 * vnode's type, "mode", uid and gid, requested access mode, and credentials. 5334 * Returns 0 on success, or an errno on failure. 
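 *
 * As a worked example: with a file mode of 0640, a VREAD | VWRITE
 * request from the owning uid is satisfied by the owner bits alone,
 * a VREAD request from a group member is satisfied by the group bits,
 * and a VWRITE request from a group member fails with EACCES unless
 * PRIV_VFS_WRITE is held.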
5335 */ 5336 int 5337 vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid, 5338 accmode_t accmode, struct ucred *cred) 5339 { 5340 accmode_t dac_granted; 5341 accmode_t priv_granted; 5342 5343 KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0, 5344 ("invalid bit in accmode")); 5345 KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE), 5346 ("VAPPEND without VWRITE")); 5347 5348 /* 5349 * Look for a normal, non-privileged way to access the file/directory 5350 * as requested. If it exists, go with that. 5351 */ 5352 5353 dac_granted = 0; 5354 5355 /* Check the owner. */ 5356 if (cred->cr_uid == file_uid) { 5357 dac_granted |= VADMIN; 5358 if (file_mode & S_IXUSR) 5359 dac_granted |= VEXEC; 5360 if (file_mode & S_IRUSR) 5361 dac_granted |= VREAD; 5362 if (file_mode & S_IWUSR) 5363 dac_granted |= (VWRITE | VAPPEND); 5364 5365 if ((accmode & dac_granted) == accmode) 5366 return (0); 5367 5368 goto privcheck; 5369 } 5370 5371 /* Otherwise, check the groups (first match) */ 5372 if (groupmember(file_gid, cred)) { 5373 if (file_mode & S_IXGRP) 5374 dac_granted |= VEXEC; 5375 if (file_mode & S_IRGRP) 5376 dac_granted |= VREAD; 5377 if (file_mode & S_IWGRP) 5378 dac_granted |= (VWRITE | VAPPEND); 5379 5380 if ((accmode & dac_granted) == accmode) 5381 return (0); 5382 5383 goto privcheck; 5384 } 5385 5386 /* Otherwise, check everyone else. */ 5387 if (file_mode & S_IXOTH) 5388 dac_granted |= VEXEC; 5389 if (file_mode & S_IROTH) 5390 dac_granted |= VREAD; 5391 if (file_mode & S_IWOTH) 5392 dac_granted |= (VWRITE | VAPPEND); 5393 if ((accmode & dac_granted) == accmode) 5394 return (0); 5395 5396 privcheck: 5397 /* 5398 * Build a privilege mask to determine if the set of privileges 5399 * satisfies the requirements when combined with the granted mask 5400 * from above. For each privilege, if the privilege is required, 5401 * bitwise or the request type onto the priv_granted mask. 5402 */ 5403 priv_granted = 0; 5404 5405 if (type == VDIR) { 5406 /* 5407 * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC 5408 * requests, instead of PRIV_VFS_EXEC. 5409 */ 5410 if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && 5411 !priv_check_cred(cred, PRIV_VFS_LOOKUP)) 5412 priv_granted |= VEXEC; 5413 } else { 5414 /* 5415 * Ensure that at least one execute bit is on. Otherwise, 5416 * a privileged user will always succeed, and we don't want 5417 * this to happen unless the file really is executable. 5418 */ 5419 if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && 5420 (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 && 5421 !priv_check_cred(cred, PRIV_VFS_EXEC)) 5422 priv_granted |= VEXEC; 5423 } 5424 5425 if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) && 5426 !priv_check_cred(cred, PRIV_VFS_READ)) 5427 priv_granted |= VREAD; 5428 5429 if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) && 5430 !priv_check_cred(cred, PRIV_VFS_WRITE)) 5431 priv_granted |= (VWRITE | VAPPEND); 5432 5433 if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) && 5434 !priv_check_cred(cred, PRIV_VFS_ADMIN)) 5435 priv_granted |= VADMIN; 5436 5437 if ((accmode & (priv_granted | dac_granted)) == accmode) { 5438 return (0); 5439 } 5440 5441 return ((accmode & VADMIN) ? EPERM : EACCES); 5442 } 5443 5444 /* 5445 * Credential check based on process requesting service, and per-attribute 5446 * permissions. 
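 *
 * EXTATTR_NAMESPACE_SYSTEM is gated by PRIV_VFS_EXTATTR_SYSTEM,
 * EXTATTR_NAMESPACE_USER falls through to a regular VOP_ACCESS()
 * check, and any other namespace is rejected with EPERM.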
5447 */ 5448 int 5449 extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred, 5450 struct thread *td, accmode_t accmode) 5451 { 5452 5453 /* 5454 * Kernel-invoked always succeeds. 5455 */ 5456 if (cred == NOCRED) 5457 return (0); 5458 5459 /* 5460 * Do not allow privileged processes in jail to directly manipulate 5461 * system attributes. 5462 */ 5463 switch (attrnamespace) { 5464 case EXTATTR_NAMESPACE_SYSTEM: 5465 /* Potentially should be: return (EPERM); */ 5466 return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM)); 5467 case EXTATTR_NAMESPACE_USER: 5468 return (VOP_ACCESS(vp, accmode, cred, td)); 5469 default: 5470 return (EPERM); 5471 } 5472 } 5473 5474 #ifdef DEBUG_VFS_LOCKS 5475 int vfs_badlock_ddb = 1; /* Drop into debugger on violation. */ 5476 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0, 5477 "Drop into debugger on lock violation"); 5478 5479 int vfs_badlock_mutex = 1; /* Check for interlock across VOPs. */ 5480 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex, 5481 0, "Check for interlock across VOPs"); 5482 5483 int vfs_badlock_print = 1; /* Print lock violations. */ 5484 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print, 5485 0, "Print lock violations"); 5486 5487 int vfs_badlock_vnode = 1; /* Print vnode details on lock violations. */ 5488 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_vnode, CTLFLAG_RW, &vfs_badlock_vnode, 5489 0, "Print vnode details on lock violations"); 5490 5491 #ifdef KDB 5492 int vfs_badlock_backtrace = 1; /* Print backtrace at lock violations. */ 5493 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW, 5494 &vfs_badlock_backtrace, 0, "Print backtrace at lock violations"); 5495 #endif 5496 5497 static void 5498 vfs_badlock(const char *msg, const char *str, struct vnode *vp) 5499 { 5500 5501 #ifdef KDB 5502 if (vfs_badlock_backtrace) 5503 kdb_backtrace(); 5504 #endif 5505 if (vfs_badlock_vnode) 5506 vn_printf(vp, "vnode "); 5507 if (vfs_badlock_print) 5508 printf("%s: %p %s\n", str, (void *)vp, msg); 5509 if (vfs_badlock_ddb) 5510 kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); 5511 } 5512 5513 void 5514 assert_vi_locked(struct vnode *vp, const char *str) 5515 { 5516 5517 if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp))) 5518 vfs_badlock("interlock is not locked but should be", str, vp); 5519 } 5520 5521 void 5522 assert_vi_unlocked(struct vnode *vp, const char *str) 5523 { 5524 5525 if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp))) 5526 vfs_badlock("interlock is locked but should not be", str, vp); 5527 } 5528 5529 void 5530 assert_vop_locked(struct vnode *vp, const char *str) 5531 { 5532 int locked; 5533 5534 if (KERNEL_PANICKED() || vp == NULL) 5535 return; 5536 5537 locked = VOP_ISLOCKED(vp); 5538 if (locked == 0 || locked == LK_EXCLOTHER) 5539 vfs_badlock("is not locked but should be", str, vp); 5540 } 5541 5542 void 5543 assert_vop_unlocked(struct vnode *vp, const char *str) 5544 { 5545 if (KERNEL_PANICKED() || vp == NULL) 5546 return; 5547 5548 if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE) 5549 vfs_badlock("is locked but should not be", str, vp); 5550 } 5551 5552 void 5553 assert_vop_elocked(struct vnode *vp, const char *str) 5554 { 5555 if (KERNEL_PANICKED() || vp == NULL) 5556 return; 5557 5558 if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) 5559 vfs_badlock("is not exclusive locked but should be", str, vp); 5560 } 5561 #endif /* DEBUG_VFS_LOCKS */ 5562 5563 void 5564 vop_rename_fail(struct vop_rename_args *ap) 5565 { 5566 5567 if (ap->a_tvp != 
NULL) 5568 vput(ap->a_tvp); 5569 if (ap->a_tdvp == ap->a_tvp) 5570 vrele(ap->a_tdvp); 5571 else 5572 vput(ap->a_tdvp); 5573 vrele(ap->a_fdvp); 5574 vrele(ap->a_fvp); 5575 } 5576 5577 void 5578 vop_rename_pre(void *ap) 5579 { 5580 struct vop_rename_args *a = ap; 5581 5582 #ifdef DEBUG_VFS_LOCKS 5583 if (a->a_tvp) 5584 ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME"); 5585 ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME"); 5586 ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME"); 5587 ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME"); 5588 5589 /* Check the source (from). */ 5590 if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock && 5591 (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock)) 5592 ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked"); 5593 if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock) 5594 ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked"); 5595 5596 /* Check the target. */ 5597 if (a->a_tvp) 5598 ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked"); 5599 ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked"); 5600 #endif 5601 /* 5602 * It may be tempting to add vn_seqc_write_begin/end calls here and 5603 * in vop_rename_post but that's not going to work out since some 5604 * filesystems relookup vnodes mid-rename. This is probably a bug. 5605 * 5606 * For now filesystems are expected to do the relevant calls after they 5607 * decide what vnodes to operate on. 5608 */ 5609 if (a->a_tdvp != a->a_fdvp) 5610 vhold(a->a_fdvp); 5611 if (a->a_tvp != a->a_fvp) 5612 vhold(a->a_fvp); 5613 vhold(a->a_tdvp); 5614 if (a->a_tvp) 5615 vhold(a->a_tvp); 5616 } 5617 5618 #ifdef DEBUG_VFS_LOCKS 5619 void 5620 vop_fplookup_vexec_debugpre(void *ap __unused) 5621 { 5622 5623 VFS_SMR_ASSERT_ENTERED(); 5624 } 5625 5626 void 5627 vop_fplookup_vexec_debugpost(void *ap __unused, int rc __unused) 5628 { 5629 5630 VFS_SMR_ASSERT_ENTERED(); 5631 } 5632 5633 void 5634 vop_fplookup_symlink_debugpre(void *ap __unused) 5635 { 5636 5637 VFS_SMR_ASSERT_ENTERED(); 5638 } 5639 5640 void 5641 vop_fplookup_symlink_debugpost(void *ap __unused, int rc __unused) 5642 { 5643 5644 VFS_SMR_ASSERT_ENTERED(); 5645 } 5646 5647 static void 5648 vop_fsync_debugprepost(struct vnode *vp, const char *name) 5649 { 5650 if (vp->v_type == VCHR) 5651 ; 5652 else if (MNT_EXTENDED_SHARED(vp->v_mount)) 5653 ASSERT_VOP_LOCKED(vp, name); 5654 else 5655 ASSERT_VOP_ELOCKED(vp, name); 5656 } 5657 5658 void 5659 vop_fsync_debugpre(void *a) 5660 { 5661 struct vop_fsync_args *ap; 5662 5663 ap = a; 5664 vop_fsync_debugprepost(ap->a_vp, "fsync"); 5665 } 5666 5667 void 5668 vop_fsync_debugpost(void *a, int rc __unused) 5669 { 5670 struct vop_fsync_args *ap; 5671 5672 ap = a; 5673 vop_fsync_debugprepost(ap->a_vp, "fsync"); 5674 } 5675 5676 void 5677 vop_fdatasync_debugpre(void *a) 5678 { 5679 struct vop_fdatasync_args *ap; 5680 5681 ap = a; 5682 vop_fsync_debugprepost(ap->a_vp, "fsync"); 5683 } 5684 5685 void 5686 vop_fdatasync_debugpost(void *a, int rc __unused) 5687 { 5688 struct vop_fdatasync_args *ap; 5689 5690 ap = a; 5691 vop_fsync_debugprepost(ap->a_vp, "fsync"); 5692 } 5693 5694 void 5695 vop_strategy_debugpre(void *ap) 5696 { 5697 struct vop_strategy_args *a; 5698 struct buf *bp; 5699 5700 a = ap; 5701 bp = a->a_bp; 5702 5703 /* 5704 * Cluster ops lock their component buffers but not the IO container. 
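 * B_CLUSTER buffers are therefore exempt from the buffer lock
 * assertion below.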
5705 */ 5706 if ((bp->b_flags & B_CLUSTER) != 0) 5707 return; 5708 5709 if (!KERNEL_PANICKED() && !BUF_ISLOCKED(bp)) { 5710 if (vfs_badlock_print) 5711 printf( 5712 "VOP_STRATEGY: bp is not locked but should be\n"); 5713 if (vfs_badlock_ddb) 5714 kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); 5715 } 5716 } 5717 5718 void 5719 vop_lock_debugpre(void *ap) 5720 { 5721 struct vop_lock1_args *a = ap; 5722 5723 if ((a->a_flags & LK_INTERLOCK) == 0) 5724 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); 5725 else 5726 ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK"); 5727 } 5728 5729 void 5730 vop_lock_debugpost(void *ap, int rc) 5731 { 5732 struct vop_lock1_args *a = ap; 5733 5734 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); 5735 if (rc == 0 && (a->a_flags & LK_EXCLOTHER) == 0) 5736 ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK"); 5737 } 5738 5739 void 5740 vop_unlock_debugpre(void *ap) 5741 { 5742 struct vop_unlock_args *a = ap; 5743 struct vnode *vp = a->a_vp; 5744 5745 VNPASS(vn_get_state(vp) != VSTATE_UNINITIALIZED, vp); 5746 ASSERT_VOP_LOCKED(vp, "VOP_UNLOCK"); 5747 } 5748 5749 void 5750 vop_need_inactive_debugpre(void *ap) 5751 { 5752 struct vop_need_inactive_args *a = ap; 5753 5754 ASSERT_VI_LOCKED(a->a_vp, "VOP_NEED_INACTIVE"); 5755 } 5756 5757 void 5758 vop_need_inactive_debugpost(void *ap, int rc) 5759 { 5760 struct vop_need_inactive_args *a = ap; 5761 5762 ASSERT_VI_LOCKED(a->a_vp, "VOP_NEED_INACTIVE"); 5763 } 5764 #endif 5765 5766 void 5767 vop_create_pre(void *ap) 5768 { 5769 struct vop_create_args *a; 5770 struct vnode *dvp; 5771 5772 a = ap; 5773 dvp = a->a_dvp; 5774 vn_seqc_write_begin(dvp); 5775 } 5776 5777 void 5778 vop_create_post(void *ap, int rc) 5779 { 5780 struct vop_create_args *a; 5781 struct vnode *dvp; 5782 5783 a = ap; 5784 dvp = a->a_dvp; 5785 vn_seqc_write_end(dvp); 5786 if (!rc) 5787 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); 5788 } 5789 5790 void 5791 vop_whiteout_pre(void *ap) 5792 { 5793 struct vop_whiteout_args *a; 5794 struct vnode *dvp; 5795 5796 a = ap; 5797 dvp = a->a_dvp; 5798 vn_seqc_write_begin(dvp); 5799 } 5800 5801 void 5802 vop_whiteout_post(void *ap, int rc) 5803 { 5804 struct vop_whiteout_args *a; 5805 struct vnode *dvp; 5806 5807 a = ap; 5808 dvp = a->a_dvp; 5809 vn_seqc_write_end(dvp); 5810 } 5811 5812 void 5813 vop_deleteextattr_pre(void *ap) 5814 { 5815 struct vop_deleteextattr_args *a; 5816 struct vnode *vp; 5817 5818 a = ap; 5819 vp = a->a_vp; 5820 vn_seqc_write_begin(vp); 5821 } 5822 5823 void 5824 vop_deleteextattr_post(void *ap, int rc) 5825 { 5826 struct vop_deleteextattr_args *a; 5827 struct vnode *vp; 5828 5829 a = ap; 5830 vp = a->a_vp; 5831 vn_seqc_write_end(vp); 5832 if (!rc) 5833 VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); 5834 } 5835 5836 void 5837 vop_link_pre(void *ap) 5838 { 5839 struct vop_link_args *a; 5840 struct vnode *vp, *tdvp; 5841 5842 a = ap; 5843 vp = a->a_vp; 5844 tdvp = a->a_tdvp; 5845 vn_seqc_write_begin(vp); 5846 vn_seqc_write_begin(tdvp); 5847 } 5848 5849 void 5850 vop_link_post(void *ap, int rc) 5851 { 5852 struct vop_link_args *a; 5853 struct vnode *vp, *tdvp; 5854 5855 a = ap; 5856 vp = a->a_vp; 5857 tdvp = a->a_tdvp; 5858 vn_seqc_write_end(vp); 5859 vn_seqc_write_end(tdvp); 5860 if (!rc) { 5861 VFS_KNOTE_LOCKED(vp, NOTE_LINK); 5862 VFS_KNOTE_LOCKED(tdvp, NOTE_WRITE); 5863 } 5864 } 5865 5866 void 5867 vop_mkdir_pre(void *ap) 5868 { 5869 struct vop_mkdir_args *a; 5870 struct vnode *dvp; 5871 5872 a = ap; 5873 dvp = a->a_dvp; 5874 vn_seqc_write_begin(dvp); 5875 } 5876 5877 void 5878 vop_mkdir_post(void *ap, int rc) 5879 { 5880 struct vop_mkdir_args *a; 5881 
struct vnode *dvp; 5882 5883 a = ap; 5884 dvp = a->a_dvp; 5885 vn_seqc_write_end(dvp); 5886 if (!rc) 5887 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK); 5888 } 5889 5890 #ifdef DEBUG_VFS_LOCKS 5891 void 5892 vop_mkdir_debugpost(void *ap, int rc) 5893 { 5894 struct vop_mkdir_args *a; 5895 5896 a = ap; 5897 if (!rc) 5898 cache_validate(a->a_dvp, *a->a_vpp, a->a_cnp); 5899 } 5900 #endif 5901 5902 void 5903 vop_mknod_pre(void *ap) 5904 { 5905 struct vop_mknod_args *a; 5906 struct vnode *dvp; 5907 5908 a = ap; 5909 dvp = a->a_dvp; 5910 vn_seqc_write_begin(dvp); 5911 } 5912 5913 void 5914 vop_mknod_post(void *ap, int rc) 5915 { 5916 struct vop_mknod_args *a; 5917 struct vnode *dvp; 5918 5919 a = ap; 5920 dvp = a->a_dvp; 5921 vn_seqc_write_end(dvp); 5922 if (!rc) 5923 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); 5924 } 5925 5926 void 5927 vop_reclaim_post(void *ap, int rc) 5928 { 5929 struct vop_reclaim_args *a; 5930 struct vnode *vp; 5931 5932 a = ap; 5933 vp = a->a_vp; 5934 ASSERT_VOP_IN_SEQC(vp); 5935 if (!rc) 5936 VFS_KNOTE_LOCKED(vp, NOTE_REVOKE); 5937 } 5938 5939 void 5940 vop_remove_pre(void *ap) 5941 { 5942 struct vop_remove_args *a; 5943 struct vnode *dvp, *vp; 5944 5945 a = ap; 5946 dvp = a->a_dvp; 5947 vp = a->a_vp; 5948 vn_seqc_write_begin(dvp); 5949 vn_seqc_write_begin(vp); 5950 } 5951 5952 void 5953 vop_remove_post(void *ap, int rc) 5954 { 5955 struct vop_remove_args *a; 5956 struct vnode *dvp, *vp; 5957 5958 a = ap; 5959 dvp = a->a_dvp; 5960 vp = a->a_vp; 5961 vn_seqc_write_end(dvp); 5962 vn_seqc_write_end(vp); 5963 if (!rc) { 5964 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); 5965 VFS_KNOTE_LOCKED(vp, NOTE_DELETE); 5966 } 5967 } 5968 5969 void 5970 vop_rename_post(void *ap, int rc) 5971 { 5972 struct vop_rename_args *a = ap; 5973 long hint; 5974 5975 if (!rc) { 5976 hint = NOTE_WRITE; 5977 if (a->a_fdvp == a->a_tdvp) { 5978 if (a->a_tvp != NULL && a->a_tvp->v_type == VDIR) 5979 hint |= NOTE_LINK; 5980 VFS_KNOTE_UNLOCKED(a->a_fdvp, hint); 5981 VFS_KNOTE_UNLOCKED(a->a_tdvp, hint); 5982 } else { 5983 hint |= NOTE_EXTEND; 5984 if (a->a_fvp->v_type == VDIR) 5985 hint |= NOTE_LINK; 5986 VFS_KNOTE_UNLOCKED(a->a_fdvp, hint); 5987 5988 if (a->a_fvp->v_type == VDIR && a->a_tvp != NULL && 5989 a->a_tvp->v_type == VDIR) 5990 hint &= ~NOTE_LINK; 5991 VFS_KNOTE_UNLOCKED(a->a_tdvp, hint); 5992 } 5993 5994 VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME); 5995 if (a->a_tvp) 5996 VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE); 5997 } 5998 if (a->a_tdvp != a->a_fdvp) 5999 vdrop(a->a_fdvp); 6000 if (a->a_tvp != a->a_fvp) 6001 vdrop(a->a_fvp); 6002 vdrop(a->a_tdvp); 6003 if (a->a_tvp) 6004 vdrop(a->a_tvp); 6005 } 6006 6007 void 6008 vop_rmdir_pre(void *ap) 6009 { 6010 struct vop_rmdir_args *a; 6011 struct vnode *dvp, *vp; 6012 6013 a = ap; 6014 dvp = a->a_dvp; 6015 vp = a->a_vp; 6016 vn_seqc_write_begin(dvp); 6017 vn_seqc_write_begin(vp); 6018 } 6019 6020 void 6021 vop_rmdir_post(void *ap, int rc) 6022 { 6023 struct vop_rmdir_args *a; 6024 struct vnode *dvp, *vp; 6025 6026 a = ap; 6027 dvp = a->a_dvp; 6028 vp = a->a_vp; 6029 vn_seqc_write_end(dvp); 6030 vn_seqc_write_end(vp); 6031 if (!rc) { 6032 vp->v_vflag |= VV_UNLINKED; 6033 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK); 6034 VFS_KNOTE_LOCKED(vp, NOTE_DELETE); 6035 } 6036 } 6037 6038 void 6039 vop_setattr_pre(void *ap) 6040 { 6041 struct vop_setattr_args *a; 6042 struct vnode *vp; 6043 6044 a = ap; 6045 vp = a->a_vp; 6046 vn_seqc_write_begin(vp); 6047 } 6048 6049 void 6050 vop_setattr_post(void *ap, int rc) 6051 { 6052 struct vop_setattr_args *a; 6053 struct vnode *vp; 6054 6055 
a = ap; 6056 vp = a->a_vp; 6057 vn_seqc_write_end(vp); 6058 if (!rc) 6059 VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB); 6060 } 6061 6062 void 6063 vop_setacl_pre(void *ap) 6064 { 6065 struct vop_setacl_args *a; 6066 struct vnode *vp; 6067 6068 a = ap; 6069 vp = a->a_vp; 6070 vn_seqc_write_begin(vp); 6071 } 6072 6073 void 6074 vop_setacl_post(void *ap, int rc __unused) 6075 { 6076 struct vop_setacl_args *a; 6077 struct vnode *vp; 6078 6079 a = ap; 6080 vp = a->a_vp; 6081 vn_seqc_write_end(vp); 6082 } 6083 6084 void 6085 vop_setextattr_pre(void *ap) 6086 { 6087 struct vop_setextattr_args *a; 6088 struct vnode *vp; 6089 6090 a = ap; 6091 vp = a->a_vp; 6092 vn_seqc_write_begin(vp); 6093 } 6094 6095 void 6096 vop_setextattr_post(void *ap, int rc) 6097 { 6098 struct vop_setextattr_args *a; 6099 struct vnode *vp; 6100 6101 a = ap; 6102 vp = a->a_vp; 6103 vn_seqc_write_end(vp); 6104 if (!rc) 6105 VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB); 6106 } 6107 6108 void 6109 vop_symlink_pre(void *ap) 6110 { 6111 struct vop_symlink_args *a; 6112 struct vnode *dvp; 6113 6114 a = ap; 6115 dvp = a->a_dvp; 6116 vn_seqc_write_begin(dvp); 6117 } 6118 6119 void 6120 vop_symlink_post(void *ap, int rc) 6121 { 6122 struct vop_symlink_args *a; 6123 struct vnode *dvp; 6124 6125 a = ap; 6126 dvp = a->a_dvp; 6127 vn_seqc_write_end(dvp); 6128 if (!rc) 6129 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); 6130 } 6131 6132 void 6133 vop_open_post(void *ap, int rc) 6134 { 6135 struct vop_open_args *a = ap; 6136 6137 if (!rc) 6138 VFS_KNOTE_LOCKED(a->a_vp, NOTE_OPEN); 6139 } 6140 6141 void 6142 vop_close_post(void *ap, int rc) 6143 { 6144 struct vop_close_args *a = ap; 6145 6146 if (!rc && (a->a_cred != NOCRED || /* filter out revokes */ 6147 !VN_IS_DOOMED(a->a_vp))) { 6148 VFS_KNOTE_LOCKED(a->a_vp, (a->a_fflag & FWRITE) != 0 ? 6149 NOTE_CLOSE_WRITE : NOTE_CLOSE); 6150 } 6151 } 6152 6153 void 6154 vop_read_post(void *ap, int rc) 6155 { 6156 struct vop_read_args *a = ap; 6157 6158 if (!rc) 6159 VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ); 6160 } 6161 6162 void 6163 vop_read_pgcache_post(void *ap, int rc) 6164 { 6165 struct vop_read_pgcache_args *a = ap; 6166 6167 if (!rc) 6168 VFS_KNOTE_UNLOCKED(a->a_vp, NOTE_READ); 6169 } 6170 6171 void 6172 vop_readdir_post(void *ap, int rc) 6173 { 6174 struct vop_readdir_args *a = ap; 6175 6176 if (!rc) 6177 VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ); 6178 } 6179 6180 static struct knlist fs_knlist; 6181 6182 static void 6183 vfs_event_init(void *arg) 6184 { 6185 knlist_init_mtx(&fs_knlist, NULL); 6186 } 6187 /* XXX - correct order? 
*/ 6188 SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL); 6189 6190 void 6191 vfs_event_signal(fsid_t *fsid, uint32_t event, intptr_t data __unused) 6192 { 6193 6194 KNOTE_UNLOCKED(&fs_knlist, event); 6195 } 6196 6197 static int filt_fsattach(struct knote *kn); 6198 static void filt_fsdetach(struct knote *kn); 6199 static int filt_fsevent(struct knote *kn, long hint); 6200 6201 struct filterops fs_filtops = { 6202 .f_isfd = 0, 6203 .f_attach = filt_fsattach, 6204 .f_detach = filt_fsdetach, 6205 .f_event = filt_fsevent 6206 }; 6207 6208 static int 6209 filt_fsattach(struct knote *kn) 6210 { 6211 6212 kn->kn_flags |= EV_CLEAR; 6213 knlist_add(&fs_knlist, kn, 0); 6214 return (0); 6215 } 6216 6217 static void 6218 filt_fsdetach(struct knote *kn) 6219 { 6220 6221 knlist_remove(&fs_knlist, kn, 0); 6222 } 6223 6224 static int 6225 filt_fsevent(struct knote *kn, long hint) 6226 { 6227 6228 kn->kn_fflags |= kn->kn_sfflags & hint; 6229 6230 return (kn->kn_fflags != 0); 6231 } 6232 6233 static int 6234 sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS) 6235 { 6236 struct vfsidctl vc; 6237 int error; 6238 struct mount *mp; 6239 6240 error = SYSCTL_IN(req, &vc, sizeof(vc)); 6241 if (error) 6242 return (error); 6243 if (vc.vc_vers != VFS_CTL_VERS1) 6244 return (EINVAL); 6245 mp = vfs_getvfs(&vc.vc_fsid); 6246 if (mp == NULL) 6247 return (ENOENT); 6248 /* ensure that a specific sysctl goes to the right filesystem. */ 6249 if (strcmp(vc.vc_fstypename, "*") != 0 && 6250 strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) { 6251 vfs_rel(mp); 6252 return (EINVAL); 6253 } 6254 VCTLTOREQ(&vc, req); 6255 error = VFS_SYSCTL(mp, vc.vc_op, req); 6256 vfs_rel(mp); 6257 return (error); 6258 } 6259 6260 SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE | CTLFLAG_MPSAFE | CTLFLAG_WR, 6261 NULL, 0, sysctl_vfs_ctl, "", 6262 "Sysctl by fsid"); 6263 6264 /* 6265 * Function to initialize a va_filerev field sensibly. 6266 * XXX: Wouldn't a random number make a lot more sense ?? 
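 *
 * As a purely illustrative sketch (the examplefs_node structure and its
 * en_filerev field are hypothetical), a filesystem with no stable on-disk
 * revision counter would typically seed the value once and bump it on
 * every change:
 *
 *	node->en_filerev = init_va_filerev();	at node allocation
 *	node->en_filerev++;			on each data/metadata update
 *	vap->va_filerev = node->en_filerev;	in its VOP_GETATTR method
 *
 * so that consumers of va_filerev (e.g. the NFS server's change
 * attribute) observe a new value whenever the file changes.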
6267 */ 6268 u_quad_t 6269 init_va_filerev(void) 6270 { 6271 struct bintime bt; 6272 6273 getbinuptime(&bt); 6274 return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL)); 6275 } 6276 6277 static int filt_vfsread(struct knote *kn, long hint); 6278 static int filt_vfswrite(struct knote *kn, long hint); 6279 static int filt_vfsvnode(struct knote *kn, long hint); 6280 static void filt_vfsdetach(struct knote *kn); 6281 static struct filterops vfsread_filtops = { 6282 .f_isfd = 1, 6283 .f_detach = filt_vfsdetach, 6284 .f_event = filt_vfsread 6285 }; 6286 static struct filterops vfswrite_filtops = { 6287 .f_isfd = 1, 6288 .f_detach = filt_vfsdetach, 6289 .f_event = filt_vfswrite 6290 }; 6291 static struct filterops vfsvnode_filtops = { 6292 .f_isfd = 1, 6293 .f_detach = filt_vfsdetach, 6294 .f_event = filt_vfsvnode 6295 }; 6296 6297 static void 6298 vfs_knllock(void *arg) 6299 { 6300 struct vnode *vp = arg; 6301 6302 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 6303 } 6304 6305 static void 6306 vfs_knlunlock(void *arg) 6307 { 6308 struct vnode *vp = arg; 6309 6310 VOP_UNLOCK(vp); 6311 } 6312 6313 static void 6314 vfs_knl_assert_lock(void *arg, int what) 6315 { 6316 #ifdef DEBUG_VFS_LOCKS 6317 struct vnode *vp = arg; 6318 6319 if (what == LA_LOCKED) 6320 ASSERT_VOP_LOCKED(vp, "vfs_knl_assert_locked"); 6321 else 6322 ASSERT_VOP_UNLOCKED(vp, "vfs_knl_assert_unlocked"); 6323 #endif 6324 } 6325 6326 int 6327 vfs_kqfilter(struct vop_kqfilter_args *ap) 6328 { 6329 struct vnode *vp = ap->a_vp; 6330 struct knote *kn = ap->a_kn; 6331 struct knlist *knl; 6332 6333 KASSERT(vp->v_type != VFIFO || (kn->kn_filter != EVFILT_READ && 6334 kn->kn_filter != EVFILT_WRITE), 6335 ("READ/WRITE filter on a FIFO leaked through")); 6336 switch (kn->kn_filter) { 6337 case EVFILT_READ: 6338 kn->kn_fop = &vfsread_filtops; 6339 break; 6340 case EVFILT_WRITE: 6341 kn->kn_fop = &vfswrite_filtops; 6342 break; 6343 case EVFILT_VNODE: 6344 kn->kn_fop = &vfsvnode_filtops; 6345 break; 6346 default: 6347 return (EINVAL); 6348 } 6349 6350 kn->kn_hook = (caddr_t)vp; 6351 6352 v_addpollinfo(vp); 6353 if (vp->v_pollinfo == NULL) 6354 return (ENOMEM); 6355 knl = &vp->v_pollinfo->vpi_selinfo.si_note; 6356 vhold(vp); 6357 knlist_add(knl, kn, 0); 6358 6359 return (0); 6360 } 6361 6362 /* 6363 * Detach knote from vnode 6364 */ 6365 static void 6366 filt_vfsdetach(struct knote *kn) 6367 { 6368 struct vnode *vp = (struct vnode *)kn->kn_hook; 6369 6370 KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo")); 6371 knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0); 6372 vdrop(vp); 6373 } 6374 6375 /*ARGSUSED*/ 6376 static int 6377 filt_vfsread(struct knote *kn, long hint) 6378 { 6379 struct vnode *vp = (struct vnode *)kn->kn_hook; 6380 off_t size; 6381 int res; 6382 6383 /* 6384 * filesystem is gone, so set the EOF flag and schedule 6385 * the knote for deletion. 
6386 */ 6387 if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) { 6388 VI_LOCK(vp); 6389 kn->kn_flags |= (EV_EOF | EV_ONESHOT); 6390 VI_UNLOCK(vp); 6391 return (1); 6392 } 6393 6394 if (vn_getsize_locked(vp, &size, curthread->td_ucred) != 0) 6395 return (0); 6396 6397 VI_LOCK(vp); 6398 kn->kn_data = size - kn->kn_fp->f_offset; 6399 res = (kn->kn_sfflags & NOTE_FILE_POLL) != 0 || kn->kn_data != 0; 6400 VI_UNLOCK(vp); 6401 return (res); 6402 } 6403 6404 /*ARGSUSED*/ 6405 static int 6406 filt_vfswrite(struct knote *kn, long hint) 6407 { 6408 struct vnode *vp = (struct vnode *)kn->kn_hook; 6409 6410 VI_LOCK(vp); 6411 6412 /* 6413 * filesystem is gone, so set the EOF flag and schedule 6414 * the knote for deletion. 6415 */ 6416 if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) 6417 kn->kn_flags |= (EV_EOF | EV_ONESHOT); 6418 6419 kn->kn_data = 0; 6420 VI_UNLOCK(vp); 6421 return (1); 6422 } 6423 6424 static int 6425 filt_vfsvnode(struct knote *kn, long hint) 6426 { 6427 struct vnode *vp = (struct vnode *)kn->kn_hook; 6428 int res; 6429 6430 VI_LOCK(vp); 6431 if (kn->kn_sfflags & hint) 6432 kn->kn_fflags |= hint; 6433 if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) { 6434 kn->kn_flags |= EV_EOF; 6435 VI_UNLOCK(vp); 6436 return (1); 6437 } 6438 res = (kn->kn_fflags != 0); 6439 VI_UNLOCK(vp); 6440 return (res); 6441 } 6442 6443 /* 6444 * Returns whether the directory is empty or not. 6445 * If it is empty, the return value is 0; otherwise 6446 * the return value is an error value (which may 6447 * be ENOTEMPTY). 6448 */ 6449 int 6450 vfs_emptydir(struct vnode *vp) 6451 { 6452 struct uio uio; 6453 struct iovec iov; 6454 struct dirent *dirent, *dp, *endp; 6455 int error, eof; 6456 6457 error = 0; 6458 eof = 0; 6459 6460 ASSERT_VOP_LOCKED(vp, "vfs_emptydir"); 6461 VNASSERT(vp->v_type == VDIR, vp, ("vp is not a directory")); 6462 6463 dirent = malloc(sizeof(struct dirent), M_TEMP, M_WAITOK); 6464 iov.iov_base = dirent; 6465 iov.iov_len = sizeof(struct dirent); 6466 6467 uio.uio_iov = &iov; 6468 uio.uio_iovcnt = 1; 6469 uio.uio_offset = 0; 6470 uio.uio_resid = sizeof(struct dirent); 6471 uio.uio_segflg = UIO_SYSSPACE; 6472 uio.uio_rw = UIO_READ; 6473 uio.uio_td = curthread; 6474 6475 while (eof == 0 && error == 0) { 6476 error = VOP_READDIR(vp, &uio, curthread->td_ucred, &eof, 6477 NULL, NULL); 6478 if (error != 0) 6479 break; 6480 endp = (void *)((uint8_t *)dirent + 6481 sizeof(struct dirent) - uio.uio_resid); 6482 for (dp = dirent; dp < endp; 6483 dp = (void *)((uint8_t *)dp + GENERIC_DIRSIZ(dp))) { 6484 if (dp->d_type == DT_WHT) 6485 continue; 6486 if (dp->d_namlen == 0) 6487 continue; 6488 if (dp->d_type != DT_DIR && 6489 dp->d_type != DT_UNKNOWN) { 6490 error = ENOTEMPTY; 6491 break; 6492 } 6493 if (dp->d_namlen > 2) { 6494 error = ENOTEMPTY; 6495 break; 6496 } 6497 if (dp->d_namlen == 1 && 6498 dp->d_name[0] != '.') { 6499 error = ENOTEMPTY; 6500 break; 6501 } 6502 if (dp->d_namlen == 2 && 6503 dp->d_name[1] != '.') { 6504 error = ENOTEMPTY; 6505 break; 6506 } 6507 uio.uio_resid = sizeof(struct dirent); 6508 } 6509 } 6510 free(dirent, M_TEMP); 6511 return (error); 6512 } 6513 6514 int 6515 vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off) 6516 { 6517 int error; 6518 6519 if (dp->d_reclen > ap->a_uio->uio_resid) 6520 return (ENAMETOOLONG); 6521 error = uiomove(dp, dp->d_reclen, ap->a_uio); 6522 if (error) { 6523 if (ap->a_ncookies != NULL) { 6524 if (ap->a_cookies != NULL) 6525 free(ap->a_cookies, M_TEMP); 6526 ap->a_cookies = NULL; 6527 
*ap->a_ncookies = 0; 6528 } 6529 return (error); 6530 } 6531 if (ap->a_ncookies == NULL) 6532 return (0); 6533 6534 KASSERT(ap->a_cookies, 6535 ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!")); 6536 6537 *ap->a_cookies = realloc(*ap->a_cookies, 6538 (*ap->a_ncookies + 1) * sizeof(uint64_t), M_TEMP, M_WAITOK | M_ZERO); 6539 (*ap->a_cookies)[*ap->a_ncookies] = off; 6540 *ap->a_ncookies += 1; 6541 return (0); 6542 } 6543 6544 /* 6545 * The purpose of this routine is to remove granularity from accmode_t, 6546 * reducing it into standard unix access bits - VEXEC, VREAD, VWRITE, 6547 * VADMIN and VAPPEND. 6548 * 6549 * If it returns 0, the caller is supposed to continue with the usual 6550 * access checks using 'accmode' as modified by this routine. If it 6551 * returns nonzero value, the caller is supposed to return that value 6552 * as errno. 6553 * 6554 * Note that after this routine runs, accmode may be zero. 6555 */ 6556 int 6557 vfs_unixify_accmode(accmode_t *accmode) 6558 { 6559 /* 6560 * There is no way to specify explicit "deny" rule using 6561 * file mode or POSIX.1e ACLs. 6562 */ 6563 if (*accmode & VEXPLICIT_DENY) { 6564 *accmode = 0; 6565 return (0); 6566 } 6567 6568 /* 6569 * None of these can be translated into usual access bits. 6570 * Also, the common case for NFSv4 ACLs is to not contain 6571 * either of these bits. Caller should check for VWRITE 6572 * on the containing directory instead. 6573 */ 6574 if (*accmode & (VDELETE_CHILD | VDELETE)) 6575 return (EPERM); 6576 6577 if (*accmode & VADMIN_PERMS) { 6578 *accmode &= ~VADMIN_PERMS; 6579 *accmode |= VADMIN; 6580 } 6581 6582 /* 6583 * There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL 6584 * or VSYNCHRONIZE using file mode or POSIX.1e ACL. 6585 */ 6586 *accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE); 6587 6588 return (0); 6589 } 6590 6591 /* 6592 * Clear out a doomed vnode (if any) and replace it with a new one as long 6593 * as the fs is not being unmounted. Return the root vnode to the caller. 6594 */ 6595 static int __noinline 6596 vfs_cache_root_fallback(struct mount *mp, int flags, struct vnode **vpp) 6597 { 6598 struct vnode *vp; 6599 int error; 6600 6601 restart: 6602 if (mp->mnt_rootvnode != NULL) { 6603 MNT_ILOCK(mp); 6604 vp = mp->mnt_rootvnode; 6605 if (vp != NULL) { 6606 if (!VN_IS_DOOMED(vp)) { 6607 vrefact(vp); 6608 MNT_IUNLOCK(mp); 6609 error = vn_lock(vp, flags); 6610 if (error == 0) { 6611 *vpp = vp; 6612 return (0); 6613 } 6614 vrele(vp); 6615 goto restart; 6616 } 6617 /* 6618 * Clear the old one. 
6619 */ 6620 mp->mnt_rootvnode = NULL; 6621 } 6622 MNT_IUNLOCK(mp); 6623 if (vp != NULL) { 6624 vfs_op_barrier_wait(mp); 6625 vrele(vp); 6626 } 6627 } 6628 error = VFS_CACHEDROOT(mp, flags, vpp); 6629 if (error != 0) 6630 return (error); 6631 if (mp->mnt_vfs_ops == 0) { 6632 MNT_ILOCK(mp); 6633 if (mp->mnt_vfs_ops != 0) { 6634 MNT_IUNLOCK(mp); 6635 return (0); 6636 } 6637 if (mp->mnt_rootvnode == NULL) { 6638 vrefact(*vpp); 6639 mp->mnt_rootvnode = *vpp; 6640 } else { 6641 if (mp->mnt_rootvnode != *vpp) { 6642 if (!VN_IS_DOOMED(mp->mnt_rootvnode)) { 6643 panic("%s: mismatch between vnode returned " 6644 " by VFS_CACHEDROOT and the one cached " 6645 " (%p != %p)", 6646 __func__, *vpp, mp->mnt_rootvnode); 6647 } 6648 } 6649 } 6650 MNT_IUNLOCK(mp); 6651 } 6652 return (0); 6653 } 6654 6655 int 6656 vfs_cache_root(struct mount *mp, int flags, struct vnode **vpp) 6657 { 6658 struct mount_pcpu *mpcpu; 6659 struct vnode *vp; 6660 int error; 6661 6662 if (!vfs_op_thread_enter(mp, mpcpu)) 6663 return (vfs_cache_root_fallback(mp, flags, vpp)); 6664 vp = atomic_load_ptr(&mp->mnt_rootvnode); 6665 if (vp == NULL || VN_IS_DOOMED(vp)) { 6666 vfs_op_thread_exit(mp, mpcpu); 6667 return (vfs_cache_root_fallback(mp, flags, vpp)); 6668 } 6669 vrefact(vp); 6670 vfs_op_thread_exit(mp, mpcpu); 6671 error = vn_lock(vp, flags); 6672 if (error != 0) { 6673 vrele(vp); 6674 return (vfs_cache_root_fallback(mp, flags, vpp)); 6675 } 6676 *vpp = vp; 6677 return (0); 6678 } 6679 6680 struct vnode * 6681 vfs_cache_root_clear(struct mount *mp) 6682 { 6683 struct vnode *vp; 6684 6685 /* 6686 * ops > 0 guarantees there is nobody who can see this vnode 6687 */ 6688 MPASS(mp->mnt_vfs_ops > 0); 6689 vp = mp->mnt_rootvnode; 6690 if (vp != NULL) 6691 vn_seqc_write_begin(vp); 6692 mp->mnt_rootvnode = NULL; 6693 return (vp); 6694 } 6695 6696 void 6697 vfs_cache_root_set(struct mount *mp, struct vnode *vp) 6698 { 6699 6700 MPASS(mp->mnt_vfs_ops > 0); 6701 vrefact(vp); 6702 mp->mnt_rootvnode = vp; 6703 } 6704 6705 /* 6706 * These are helper functions for filesystems to traverse all 6707 * their vnodes. See MNT_VNODE_FOREACH_ALL() in sys/mount.h. 6708 * 6709 * This interface replaces MNT_VNODE_FOREACH. 6710 */ 6711 6712 struct vnode * 6713 __mnt_vnode_next_all(struct vnode **mvp, struct mount *mp) 6714 { 6715 struct vnode *vp; 6716 6717 if (should_yield()) 6718 kern_yield(PRI_USER); 6719 MNT_ILOCK(mp); 6720 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); 6721 for (vp = TAILQ_NEXT(*mvp, v_nmntvnodes); vp != NULL; 6722 vp = TAILQ_NEXT(vp, v_nmntvnodes)) { 6723 /* Allow a racy peek at VIRF_DOOMED to save a lock acquisition. */ 6724 if (vp->v_type == VMARKER || VN_IS_DOOMED(vp)) 6725 continue; 6726 VI_LOCK(vp); 6727 if (VN_IS_DOOMED(vp)) { 6728 VI_UNLOCK(vp); 6729 continue; 6730 } 6731 break; 6732 } 6733 if (vp == NULL) { 6734 __mnt_vnode_markerfree_all(mvp, mp); 6735 /* MNT_IUNLOCK(mp); -- done in above function */ 6736 mtx_assert(MNT_MTX(mp), MA_NOTOWNED); 6737 return (NULL); 6738 } 6739 TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); 6740 TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); 6741 MNT_IUNLOCK(mp); 6742 return (vp); 6743 } 6744 6745 struct vnode * 6746 __mnt_vnode_first_all(struct vnode **mvp, struct mount *mp) 6747 { 6748 struct vnode *vp; 6749 6750 *mvp = vn_alloc_marker(mp); 6751 MNT_ILOCK(mp); 6752 MNT_REF(mp); 6753 6754 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 6755 /* Allow a racy peek at VIRF_DOOMED to save a lock acquisition. 
*/ 6756 if (vp->v_type == VMARKER || VN_IS_DOOMED(vp)) 6757 continue; 6758 VI_LOCK(vp); 6759 if (VN_IS_DOOMED(vp)) { 6760 VI_UNLOCK(vp); 6761 continue; 6762 } 6763 break; 6764 } 6765 if (vp == NULL) { 6766 MNT_REL(mp); 6767 MNT_IUNLOCK(mp); 6768 vn_free_marker(*mvp); 6769 *mvp = NULL; 6770 return (NULL); 6771 } 6772 TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); 6773 MNT_IUNLOCK(mp); 6774 return (vp); 6775 } 6776 6777 void 6778 __mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp) 6779 { 6780 6781 if (*mvp == NULL) { 6782 MNT_IUNLOCK(mp); 6783 return; 6784 } 6785 6786 mtx_assert(MNT_MTX(mp), MA_OWNED); 6787 6788 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); 6789 TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); 6790 MNT_REL(mp); 6791 MNT_IUNLOCK(mp); 6792 vn_free_marker(*mvp); 6793 *mvp = NULL; 6794 } 6795 6796 /* 6797 * These are helper functions for filesystems to traverse their 6798 * lazy vnodes. See MNT_VNODE_FOREACH_LAZY() in sys/mount.h 6799 */ 6800 static void 6801 mnt_vnode_markerfree_lazy(struct vnode **mvp, struct mount *mp) 6802 { 6803 6804 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); 6805 6806 MNT_ILOCK(mp); 6807 MNT_REL(mp); 6808 MNT_IUNLOCK(mp); 6809 vn_free_marker(*mvp); 6810 *mvp = NULL; 6811 } 6812 6813 /* 6814 * Relock the mp mount vnode list lock with the vp vnode interlock in the 6815 * conventional lock order during mnt_vnode_next_lazy iteration. 6816 * 6817 * On entry, the mount vnode list lock is held and the vnode interlock is not. 6818 * The list lock is dropped and reacquired. On success, both locks are held. 6819 * On failure, the mount vnode list lock is held but the vnode interlock is 6820 * not, and the procedure may have yielded. 6821 */ 6822 static bool 6823 mnt_vnode_next_lazy_relock(struct vnode *mvp, struct mount *mp, 6824 struct vnode *vp) 6825 { 6826 6827 VNASSERT(mvp->v_mount == mp && mvp->v_type == VMARKER && 6828 TAILQ_NEXT(mvp, v_lazylist) != NULL, mvp, 6829 ("%s: bad marker", __func__)); 6830 VNASSERT(vp->v_mount == mp && vp->v_type != VMARKER, vp, 6831 ("%s: inappropriate vnode", __func__)); 6832 ASSERT_VI_UNLOCKED(vp, __func__); 6833 mtx_assert(&mp->mnt_listmtx, MA_OWNED); 6834 6835 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, mvp, v_lazylist); 6836 TAILQ_INSERT_BEFORE(vp, mvp, v_lazylist); 6837 6838 /* 6839 * Note we may be racing against vdrop which transitioned the hold 6840 * count to 0 and now waits for the ->mnt_listmtx lock. This is fine, 6841 * if we are the only user after we get the interlock we will just 6842 * vdrop. 6843 */ 6844 vhold(vp); 6845 mtx_unlock(&mp->mnt_listmtx); 6846 VI_LOCK(vp); 6847 if (VN_IS_DOOMED(vp)) { 6848 VNPASS((vp->v_mflag & VMP_LAZYLIST) == 0, vp); 6849 goto out_lost; 6850 } 6851 VNPASS(vp->v_mflag & VMP_LAZYLIST, vp); 6852 /* 6853 * There is nothing to do if we are the last user. 
6854 */ 6855 if (!refcount_release_if_not_last(&vp->v_holdcnt)) 6856 goto out_lost; 6857 mtx_lock(&mp->mnt_listmtx); 6858 return (true); 6859 out_lost: 6860 vdropl(vp); 6861 maybe_yield(); 6862 mtx_lock(&mp->mnt_listmtx); 6863 return (false); 6864 } 6865 6866 static struct vnode * 6867 mnt_vnode_next_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb, 6868 void *cbarg) 6869 { 6870 struct vnode *vp; 6871 6872 mtx_assert(&mp->mnt_listmtx, MA_OWNED); 6873 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); 6874 restart: 6875 vp = TAILQ_NEXT(*mvp, v_lazylist); 6876 while (vp != NULL) { 6877 if (vp->v_type == VMARKER) { 6878 vp = TAILQ_NEXT(vp, v_lazylist); 6879 continue; 6880 } 6881 /* 6882 * See if we want to process the vnode. Note we may encounter a 6883 * long string of vnodes we don't care about and hog the list 6884 * as a result. Check for it and requeue the marker. 6885 */ 6886 VNPASS(!VN_IS_DOOMED(vp), vp); 6887 if (!cb(vp, cbarg)) { 6888 if (!should_yield()) { 6889 vp = TAILQ_NEXT(vp, v_lazylist); 6890 continue; 6891 } 6892 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, 6893 v_lazylist); 6894 TAILQ_INSERT_AFTER(&mp->mnt_lazyvnodelist, vp, *mvp, 6895 v_lazylist); 6896 mtx_unlock(&mp->mnt_listmtx); 6897 kern_yield(PRI_USER); 6898 mtx_lock(&mp->mnt_listmtx); 6899 goto restart; 6900 } 6901 /* 6902 * Try-lock because this is the wrong lock order. 6903 */ 6904 if (!VI_TRYLOCK(vp) && 6905 !mnt_vnode_next_lazy_relock(*mvp, mp, vp)) 6906 goto restart; 6907 KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp)); 6908 KASSERT(vp->v_mount == mp || vp->v_mount == NULL, 6909 ("alien vnode on the lazy list %p %p", vp, mp)); 6910 VNPASS(vp->v_mount == mp, vp); 6911 VNPASS(!VN_IS_DOOMED(vp), vp); 6912 break; 6913 } 6914 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, v_lazylist); 6915 6916 /* Check if we are done */ 6917 if (vp == NULL) { 6918 mtx_unlock(&mp->mnt_listmtx); 6919 mnt_vnode_markerfree_lazy(mvp, mp); 6920 return (NULL); 6921 } 6922 TAILQ_INSERT_AFTER(&mp->mnt_lazyvnodelist, vp, *mvp, v_lazylist); 6923 mtx_unlock(&mp->mnt_listmtx); 6924 ASSERT_VI_LOCKED(vp, "lazy iter"); 6925 return (vp); 6926 } 6927 6928 struct vnode * 6929 __mnt_vnode_next_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb, 6930 void *cbarg) 6931 { 6932 6933 if (should_yield()) 6934 kern_yield(PRI_USER); 6935 mtx_lock(&mp->mnt_listmtx); 6936 return (mnt_vnode_next_lazy(mvp, mp, cb, cbarg)); 6937 } 6938 6939 struct vnode * 6940 __mnt_vnode_first_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb, 6941 void *cbarg) 6942 { 6943 struct vnode *vp; 6944 6945 if (TAILQ_EMPTY(&mp->mnt_lazyvnodelist)) 6946 return (NULL); 6947 6948 *mvp = vn_alloc_marker(mp); 6949 MNT_ILOCK(mp); 6950 MNT_REF(mp); 6951 MNT_IUNLOCK(mp); 6952 6953 mtx_lock(&mp->mnt_listmtx); 6954 vp = TAILQ_FIRST(&mp->mnt_lazyvnodelist); 6955 if (vp == NULL) { 6956 mtx_unlock(&mp->mnt_listmtx); 6957 mnt_vnode_markerfree_lazy(mvp, mp); 6958 return (NULL); 6959 } 6960 TAILQ_INSERT_BEFORE(vp, *mvp, v_lazylist); 6961 return (mnt_vnode_next_lazy(mvp, mp, cb, cbarg)); 6962 } 6963 6964 void 6965 __mnt_vnode_markerfree_lazy(struct vnode **mvp, struct mount *mp) 6966 { 6967 6968 if (*mvp == NULL) 6969 return; 6970 6971 mtx_lock(&mp->mnt_listmtx); 6972 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, v_lazylist); 6973 mtx_unlock(&mp->mnt_listmtx); 6974 mnt_vnode_markerfree_lazy(mvp, mp); 6975 } 6976 6977 int 6978 vn_dir_check_exec(struct vnode *vp, struct componentname *cnp) 6979 { 6980 6981 if ((cnp->cn_flags & NOEXECCHECK) != 0) { 6982 
cnp->cn_flags &= ~NOEXECCHECK; 6983 return (0); 6984 } 6985 6986 return (VOP_ACCESS(vp, VEXEC, cnp->cn_cred, curthread)); 6987 } 6988 6989 /* 6990 * Do not use this variant unless you have means other than the hold count 6991 * to prevent the vnode from getting freed. 6992 */ 6993 void 6994 vn_seqc_write_begin_locked(struct vnode *vp) 6995 { 6996 6997 ASSERT_VI_LOCKED(vp, __func__); 6998 VNPASS(vp->v_holdcnt > 0, vp); 6999 VNPASS(vp->v_seqc_users >= 0, vp); 7000 vp->v_seqc_users++; 7001 if (vp->v_seqc_users == 1) 7002 seqc_sleepable_write_begin(&vp->v_seqc); 7003 } 7004 7005 void 7006 vn_seqc_write_begin(struct vnode *vp) 7007 { 7008 7009 VI_LOCK(vp); 7010 vn_seqc_write_begin_locked(vp); 7011 VI_UNLOCK(vp); 7012 } 7013 7014 void 7015 vn_seqc_write_end_locked(struct vnode *vp) 7016 { 7017 7018 ASSERT_VI_LOCKED(vp, __func__); 7019 VNPASS(vp->v_seqc_users > 0, vp); 7020 vp->v_seqc_users--; 7021 if (vp->v_seqc_users == 0) 7022 seqc_sleepable_write_end(&vp->v_seqc); 7023 } 7024 7025 void 7026 vn_seqc_write_end(struct vnode *vp) 7027 { 7028 7029 VI_LOCK(vp); 7030 vn_seqc_write_end_locked(vp); 7031 VI_UNLOCK(vp); 7032 } 7033 7034 /* 7035 * Special case handling for allocating and freeing vnodes. 7036 * 7037 * The counter remains unchanged on free so that a doomed vnode will 7038 * keep testing as in modify as long as it is accessible with SMR. 7039 */ 7040 static void 7041 vn_seqc_init(struct vnode *vp) 7042 { 7043 7044 vp->v_seqc = 0; 7045 vp->v_seqc_users = 0; 7046 } 7047 7048 static void 7049 vn_seqc_write_end_free(struct vnode *vp) 7050 { 7051 7052 VNPASS(seqc_in_modify(vp->v_seqc), vp); 7053 VNPASS(vp->v_seqc_users == 1, vp); 7054 } 7055 7056 void 7057 vn_irflag_set_locked(struct vnode *vp, short toset) 7058 { 7059 short flags; 7060 7061 ASSERT_VI_LOCKED(vp, __func__); 7062 flags = vn_irflag_read(vp); 7063 VNASSERT((flags & toset) == 0, vp, 7064 ("%s: some of the passed flags already set (have %d, passed %d)\n", 7065 __func__, flags, toset)); 7066 atomic_store_short(&vp->v_irflag, flags | toset); 7067 } 7068 7069 void 7070 vn_irflag_set(struct vnode *vp, short toset) 7071 { 7072 7073 VI_LOCK(vp); 7074 vn_irflag_set_locked(vp, toset); 7075 VI_UNLOCK(vp); 7076 } 7077 7078 void 7079 vn_irflag_set_cond_locked(struct vnode *vp, short toset) 7080 { 7081 short flags; 7082 7083 ASSERT_VI_LOCKED(vp, __func__); 7084 flags = vn_irflag_read(vp); 7085 atomic_store_short(&vp->v_irflag, flags | toset); 7086 } 7087 7088 void 7089 vn_irflag_set_cond(struct vnode *vp, short toset) 7090 { 7091 7092 VI_LOCK(vp); 7093 vn_irflag_set_cond_locked(vp, toset); 7094 VI_UNLOCK(vp); 7095 } 7096 7097 void 7098 vn_irflag_unset_locked(struct vnode *vp, short tounset) 7099 { 7100 short flags; 7101 7102 ASSERT_VI_LOCKED(vp, __func__); 7103 flags = vn_irflag_read(vp); 7104 VNASSERT((flags & tounset) == tounset, vp, 7105 ("%s: some of the passed flags not set (have %d, passed %d)\n", 7106 __func__, flags, tounset)); 7107 atomic_store_short(&vp->v_irflag, flags & ~tounset); 7108 } 7109 7110 void 7111 vn_irflag_unset(struct vnode *vp, short tounset) 7112 { 7113 7114 VI_LOCK(vp); 7115 vn_irflag_unset_locked(vp, tounset); 7116 VI_UNLOCK(vp); 7117 } 7118 7119 int 7120 vn_getsize_locked(struct vnode *vp, off_t *size, struct ucred *cred) 7121 { 7122 struct vattr vattr; 7123 int error; 7124 7125 ASSERT_VOP_LOCKED(vp, __func__); 7126 error = VOP_GETATTR(vp, &vattr, cred); 7127 if (__predict_true(error == 0)) 7128 *size = vattr.va_size; 7129 return (error); 7130 } 7131 7132 int 7133 vn_getsize(struct vnode *vp, off_t *size, struct ucred 
*cred) 7134 { 7135 int error; 7136 7137 VOP_LOCK(vp, LK_SHARED); 7138 error = vn_getsize_locked(vp, size, cred); 7139 VOP_UNLOCK(vp); 7140 return (error); 7141 } 7142 7143 #ifdef INVARIANTS 7144 void 7145 vn_set_state_validate(struct vnode *vp, enum vstate state) 7146 { 7147 7148 switch (vp->v_state) { 7149 case VSTATE_UNINITIALIZED: 7150 switch (state) { 7151 case VSTATE_CONSTRUCTED: 7152 case VSTATE_DESTROYING: 7153 return; 7154 default: 7155 break; 7156 } 7157 break; 7158 case VSTATE_CONSTRUCTED: 7159 ASSERT_VOP_ELOCKED(vp, __func__); 7160 switch (state) { 7161 case VSTATE_DESTROYING: 7162 return; 7163 default: 7164 break; 7165 } 7166 break; 7167 case VSTATE_DESTROYING: 7168 ASSERT_VOP_ELOCKED(vp, __func__); 7169 switch (state) { 7170 case VSTATE_DEAD: 7171 return; 7172 default: 7173 break; 7174 } 7175 break; 7176 case VSTATE_DEAD: 7177 switch (state) { 7178 case VSTATE_UNINITIALIZED: 7179 return; 7180 default: 7181 break; 7182 } 7183 break; 7184 } 7185 7186 vn_printf(vp, "invalid state transition %d -> %d\n", vp->v_state, state); 7187 panic("invalid state transition %d -> %d\n", vp->v_state, state); 7188 } 7189 #endif 7190
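
/*
 * Illustrative sketch, not referenced anywhere in the tree: the read side
 * of the vn_seqc_*() write sections above.  A lockless reader snapshots
 * the counter, reads whatever vnode fields it cares about and then checks
 * that no writer was active in between.  The example_* name is made up;
 * vn_seqc_read_any() and vn_seqc_consistent() are assumed to be the
 * sys/vnode.h helpers used by the lockless lookup code (see
 * cache_fplookup()).
 */
static bool __unused
example_vn_read_irflag(struct vnode *vp, short *flagsp)
{
	seqc_t seq;

	seq = vn_seqc_read_any(vp);
	if (seqc_in_modify(seq))
		return (false);		/* writer active; caller takes locks */
	*flagsp = vn_irflag_read(vp);
	return (vn_seqc_consistent(vp, seq));
}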