1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1989, 1993 5 * The Regents of the University of California. All rights reserved. 6 * (c) UNIX System Laboratories, Inc. 7 * All or some portions of this file are derived from material licensed 8 * to the University of California by American Telephone and Telegraph 9 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 10 * the permission of UNIX System Laboratories, Inc. 11 * 12 * Redistribution and use in source and binary forms, with or without 13 * modification, are permitted provided that the following conditions 14 * are met: 15 * 1. Redistributions of source code must retain the above copyright 16 * notice, this list of conditions and the following disclaimer. 17 * 2. Redistributions in binary form must reproduce the above copyright 18 * notice, this list of conditions and the following disclaimer in the 19 * documentation and/or other materials provided with the distribution. 20 * 3. Neither the name of the University nor the names of its contributors 21 * may be used to endorse or promote products derived from this software 22 * without specific prior written permission. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 34 * SUCH DAMAGE. 
35 * 36 * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 37 */ 38 39 /* 40 * External virtual filesystem routines 41 */ 42 43 #include <sys/cdefs.h> 44 __FBSDID("$FreeBSD$"); 45 46 #include "opt_ddb.h" 47 #include "opt_watchdog.h" 48 49 #include <sys/param.h> 50 #include <sys/systm.h> 51 #include <sys/asan.h> 52 #include <sys/bio.h> 53 #include <sys/buf.h> 54 #include <sys/capsicum.h> 55 #include <sys/condvar.h> 56 #include <sys/conf.h> 57 #include <sys/counter.h> 58 #include <sys/dirent.h> 59 #include <sys/event.h> 60 #include <sys/eventhandler.h> 61 #include <sys/extattr.h> 62 #include <sys/file.h> 63 #include <sys/fcntl.h> 64 #include <sys/jail.h> 65 #include <sys/kdb.h> 66 #include <sys/kernel.h> 67 #include <sys/kthread.h> 68 #include <sys/ktr.h> 69 #include <sys/lockf.h> 70 #include <sys/malloc.h> 71 #include <sys/mount.h> 72 #include <sys/namei.h> 73 #include <sys/pctrie.h> 74 #include <sys/priv.h> 75 #include <sys/reboot.h> 76 #include <sys/refcount.h> 77 #include <sys/rwlock.h> 78 #include <sys/sched.h> 79 #include <sys/sleepqueue.h> 80 #include <sys/smr.h> 81 #include <sys/smp.h> 82 #include <sys/stat.h> 83 #include <sys/sysctl.h> 84 #include <sys/syslog.h> 85 #include <sys/vmmeter.h> 86 #include <sys/vnode.h> 87 #include <sys/watchdog.h> 88 89 #include <machine/stdarg.h> 90 91 #include <security/mac/mac_framework.h> 92 93 #include <vm/vm.h> 94 #include <vm/vm_object.h> 95 #include <vm/vm_extern.h> 96 #include <vm/pmap.h> 97 #include <vm/vm_map.h> 98 #include <vm/vm_page.h> 99 #include <vm/vm_kern.h> 100 #include <vm/uma.h> 101 102 #ifdef DDB 103 #include <ddb/ddb.h> 104 #endif 105 106 static void delmntque(struct vnode *vp); 107 static int flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, 108 int slpflag, int slptimeo); 109 static void syncer_shutdown(void *arg, int howto); 110 static int vtryrecycle(struct vnode *vp); 111 static void v_init_counters(struct vnode *); 112 static void vn_seqc_init(struct vnode *); 113 static void vn_seqc_write_end_free(struct vnode *vp); 114 static void vgonel(struct vnode *); 115 static bool vhold_recycle_free(struct vnode *); 116 static void vdropl_recycle(struct vnode *vp); 117 static void vdrop_recycle(struct vnode *vp); 118 static void vfs_knllock(void *arg); 119 static void vfs_knlunlock(void *arg); 120 static void vfs_knl_assert_lock(void *arg, int what); 121 static void destroy_vpollinfo(struct vpollinfo *vi); 122 static int v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo, 123 daddr_t startlbn, daddr_t endlbn); 124 static void vnlru_recalc(void); 125 126 /* 127 * Number of vnodes in existence. Increased whenever getnewvnode() 128 * allocates a new vnode, decreased in vdropl() for VIRF_DOOMED vnode. 129 */ 130 static u_long __exclusive_cache_line numvnodes; 131 132 SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, 133 "Number of vnodes in existence"); 134 135 static counter_u64_t vnodes_created; 136 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, vnodes_created, CTLFLAG_RD, &vnodes_created, 137 "Number of vnodes created by getnewvnode"); 138 139 /* 140 * Conversion tables for conversion from vnode types to inode formats 141 * and back. 142 */ 143 enum vtype iftovt_tab[16] = { 144 VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, 145 VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON 146 }; 147 int vttoif_tab[10] = { 148 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, 149 S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT 150 }; 151 152 /* 153 * List of allocates vnodes in the system. 
154 */ 155 static TAILQ_HEAD(freelst, vnode) vnode_list; 156 static struct vnode *vnode_list_free_marker; 157 static struct vnode *vnode_list_reclaim_marker; 158 159 /* 160 * "Free" vnode target. Free vnodes are rarely completely free, but are 161 * just ones that are cheap to recycle. Usually they are for files which 162 * have been stat'd but not read; these usually have inode and namecache 163 * data attached to them. This target is the preferred minimum size of a 164 * sub-cache consisting mostly of such files. The system balances the size 165 * of this sub-cache with its complement to try to prevent either from 166 * thrashing while the other is relatively inactive. The targets express 167 * a preference for the best balance. 168 * 169 * "Above" this target there are 2 further targets (watermarks) related 170 * to recyling of free vnodes. In the best-operating case, the cache is 171 * exactly full, the free list has size between vlowat and vhiwat above the 172 * free target, and recycling from it and normal use maintains this state. 173 * Sometimes the free list is below vlowat or even empty, but this state 174 * is even better for immediate use provided the cache is not full. 175 * Otherwise, vnlru_proc() runs to reclaim enough vnodes (usually non-free 176 * ones) to reach one of these states. The watermarks are currently hard- 177 * coded as 4% and 9% of the available space higher. These and the default 178 * of 25% for wantfreevnodes are too large if the memory size is large. 179 * E.g., 9% of 75% of MAXVNODES is more than 566000 vnodes to reclaim 180 * whenever vnlru_proc() becomes active. 181 */ 182 static long wantfreevnodes; 183 static long __exclusive_cache_line freevnodes; 184 SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, 185 &freevnodes, 0, "Number of \"free\" vnodes"); 186 static long freevnodes_old; 187 188 static counter_u64_t recycles_count; 189 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, recycles, CTLFLAG_RD, &recycles_count, 190 "Number of vnodes recycled to meet vnode cache targets"); 191 192 static counter_u64_t recycles_free_count; 193 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, recycles_free, CTLFLAG_RD, &recycles_free_count, 194 "Number of free vnodes recycled to meet vnode cache targets"); 195 196 static counter_u64_t deferred_inact; 197 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, deferred_inact, CTLFLAG_RD, &deferred_inact, 198 "Number of times inactive processing was deferred"); 199 200 /* To keep more than one thread at a time from running vfs_getnewfsid */ 201 static struct mtx mntid_mtx; 202 203 /* 204 * Lock for any access to the following: 205 * vnode_list 206 * numvnodes 207 * freevnodes 208 */ 209 static struct mtx __exclusive_cache_line vnode_list_mtx; 210 211 /* Publicly exported FS */ 212 struct nfs_public nfs_pub; 213 214 static uma_zone_t buf_trie_zone; 215 static smr_t buf_trie_smr; 216 217 /* Zone for allocation of new vnodes - used exclusively by getnewvnode() */ 218 static uma_zone_t vnode_zone; 219 MALLOC_DEFINE(M_VNODEPOLL, "VN POLL", "vnode poll"); 220 221 __read_frequently smr_t vfs_smr; 222 223 /* 224 * The workitem queue. 225 * 226 * It is useful to delay writes of file data and filesystem metadata 227 * for tens of seconds so that quickly created and deleted files need 228 * not waste disk bandwidth being created and removed. To realize this, 229 * we append vnodes to a "workitem" queue. When running with a soft 230 * updates implementation, most pending metadata dependencies should 231 * not wait for more than a few seconds. 
Thus, mounted on block devices 232 * are delayed only about a half the time that file data is delayed. 233 * Similarly, directory updates are more critical, so are only delayed 234 * about a third the time that file data is delayed. Thus, there are 235 * SYNCER_MAXDELAY queues that are processed round-robin at a rate of 236 * one each second (driven off the filesystem syncer process). The 237 * syncer_delayno variable indicates the next queue that is to be processed. 238 * Items that need to be processed soon are placed in this queue: 239 * 240 * syncer_workitem_pending[syncer_delayno] 241 * 242 * A delay of fifteen seconds is done by placing the request fifteen 243 * entries later in the queue: 244 * 245 * syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask] 246 * 247 */ 248 static int syncer_delayno; 249 static long syncer_mask; 250 LIST_HEAD(synclist, bufobj); 251 static struct synclist *syncer_workitem_pending; 252 /* 253 * The sync_mtx protects: 254 * bo->bo_synclist 255 * sync_vnode_count 256 * syncer_delayno 257 * syncer_state 258 * syncer_workitem_pending 259 * syncer_worklist_len 260 * rushjob 261 */ 262 static struct mtx sync_mtx; 263 static struct cv sync_wakeup; 264 265 #define SYNCER_MAXDELAY 32 266 static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */ 267 static int syncdelay = 30; /* max time to delay syncing data */ 268 static int filedelay = 30; /* time to delay syncing files */ 269 SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, 270 "Time to delay syncing files (in seconds)"); 271 static int dirdelay = 29; /* time to delay syncing directories */ 272 SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, 273 "Time to delay syncing directories (in seconds)"); 274 static int metadelay = 28; /* time to delay syncing metadata */ 275 SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, 276 "Time to delay syncing metadata (in seconds)"); 277 static int rushjob; /* number of slots to run ASAP */ 278 static int stat_rush_requests; /* number of times I/O speeded up */ 279 SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, 280 "Number of times I/O speeded up (rush requests)"); 281 282 #define VDBATCH_SIZE 8 283 struct vdbatch { 284 u_int index; 285 long freevnodes; 286 struct mtx lock; 287 struct vnode *tab[VDBATCH_SIZE]; 288 }; 289 DPCPU_DEFINE_STATIC(struct vdbatch, vd); 290 291 static void vdbatch_dequeue(struct vnode *vp); 292 293 /* 294 * When shutting down the syncer, run it at four times normal speed. 295 */ 296 #define SYNCER_SHUTDOWN_SPEEDUP 4 297 static int sync_vnode_count; 298 static int syncer_worklist_len; 299 static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY } 300 syncer_state; 301 302 /* Target for maximum number of vnodes. */ 303 u_long desiredvnodes; 304 static u_long gapvnodes; /* gap between wanted and desired */ 305 static u_long vhiwat; /* enough extras after expansion */ 306 static u_long vlowat; /* minimal extras before expansion */ 307 static u_long vstir; /* nonzero to stir non-free vnodes */ 308 static volatile int vsmalltrigger = 8; /* pref to keep if > this many pages */ 309 310 static u_long vnlru_read_freevnodes(void); 311 312 /* 313 * Note that no attempt is made to sanitize these parameters. 
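 *
 * Purely as an illustrative sketch (not part of the original source): a
 * userland consumer would typically drive this handler through sysctl(3),
 * e.g. with a hypothetical new target of 500000 and <sys/sysctl.h> plus
 * <err.h> included:
 *
 *	u_long val = 500000;
 *	if (sysctlbyname("kern.maxvnodes", NULL, NULL, &val, sizeof(val)) == -1)
 *		warn("sysctlbyname(kern.maxvnodes)");
 *
 * which lands in sysctl_maxvnodes() below and recomputes the vnlru targets.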
314 */ 315 static int 316 sysctl_maxvnodes(SYSCTL_HANDLER_ARGS) 317 { 318 u_long val; 319 int error; 320 321 val = desiredvnodes; 322 error = sysctl_handle_long(oidp, &val, 0, req); 323 if (error != 0 || req->newptr == NULL) 324 return (error); 325 326 if (val == desiredvnodes) 327 return (0); 328 mtx_lock(&vnode_list_mtx); 329 desiredvnodes = val; 330 wantfreevnodes = desiredvnodes / 4; 331 vnlru_recalc(); 332 mtx_unlock(&vnode_list_mtx); 333 /* 334 * XXX There is no protection against multiple threads changing 335 * desiredvnodes at the same time. Locking above only helps vnlru and 336 * getnewvnode. 337 */ 338 vfs_hash_changesize(desiredvnodes); 339 cache_changesize(desiredvnodes); 340 return (0); 341 } 342 343 SYSCTL_PROC(_kern, KERN_MAXVNODES, maxvnodes, 344 CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_maxvnodes, 345 "LU", "Target for maximum number of vnodes"); 346 347 static int 348 sysctl_wantfreevnodes(SYSCTL_HANDLER_ARGS) 349 { 350 u_long val; 351 int error; 352 353 val = wantfreevnodes; 354 error = sysctl_handle_long(oidp, &val, 0, req); 355 if (error != 0 || req->newptr == NULL) 356 return (error); 357 358 if (val == wantfreevnodes) 359 return (0); 360 mtx_lock(&vnode_list_mtx); 361 wantfreevnodes = val; 362 vnlru_recalc(); 363 mtx_unlock(&vnode_list_mtx); 364 return (0); 365 } 366 367 SYSCTL_PROC(_vfs, OID_AUTO, wantfreevnodes, 368 CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_wantfreevnodes, 369 "LU", "Target for minimum number of \"free\" vnodes"); 370 371 SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW, 372 &wantfreevnodes, 0, "Old name for vfs.wantfreevnodes (legacy)"); 373 static int vnlru_nowhere; 374 SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW | CTLFLAG_STATS, 375 &vnlru_nowhere, 0, "Number of times the vnlru process ran without success"); 376 377 static int 378 sysctl_try_reclaim_vnode(SYSCTL_HANDLER_ARGS) 379 { 380 struct vnode *vp; 381 struct nameidata nd; 382 char *buf; 383 unsigned long ndflags; 384 int error; 385 386 if (req->newptr == NULL) 387 return (EINVAL); 388 if (req->newlen >= PATH_MAX) 389 return (E2BIG); 390 391 buf = malloc(PATH_MAX, M_TEMP, M_WAITOK); 392 error = SYSCTL_IN(req, buf, req->newlen); 393 if (error != 0) 394 goto out; 395 396 buf[req->newlen] = '\0'; 397 398 ndflags = LOCKLEAF | NOFOLLOW | AUDITVNODE1; 399 NDINIT(&nd, LOOKUP, ndflags, UIO_SYSSPACE, buf); 400 if ((error = namei(&nd)) != 0) 401 goto out; 402 vp = nd.ni_vp; 403 404 if (VN_IS_DOOMED(vp)) { 405 /* 406 * This vnode is being recycled. Return != 0 to let the caller 407 * know that the sysctl had no effect. 
Return EAGAIN because a 408 * subsequent call will likely succeed (since namei will create 409 * a new vnode if necessary) 410 */ 411 error = EAGAIN; 412 goto putvnode; 413 } 414 415 counter_u64_add(recycles_count, 1); 416 vgone(vp); 417 putvnode: 418 vput(vp); 419 NDFREE_PNBUF(&nd); 420 out: 421 free(buf, M_TEMP); 422 return (error); 423 } 424 425 static int 426 sysctl_ftry_reclaim_vnode(SYSCTL_HANDLER_ARGS) 427 { 428 struct thread *td = curthread; 429 struct vnode *vp; 430 struct file *fp; 431 int error; 432 int fd; 433 434 if (req->newptr == NULL) 435 return (EBADF); 436 437 error = sysctl_handle_int(oidp, &fd, 0, req); 438 if (error != 0) 439 return (error); 440 error = getvnode(curthread, fd, &cap_fcntl_rights, &fp); 441 if (error != 0) 442 return (error); 443 vp = fp->f_vnode; 444 445 error = vn_lock(vp, LK_EXCLUSIVE); 446 if (error != 0) 447 goto drop; 448 449 counter_u64_add(recycles_count, 1); 450 vgone(vp); 451 VOP_UNLOCK(vp); 452 drop: 453 fdrop(fp, td); 454 return (error); 455 } 456 457 SYSCTL_PROC(_debug, OID_AUTO, try_reclaim_vnode, 458 CTLTYPE_STRING | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0, 459 sysctl_try_reclaim_vnode, "A", "Try to reclaim a vnode by its pathname"); 460 SYSCTL_PROC(_debug, OID_AUTO, ftry_reclaim_vnode, 461 CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0, 462 sysctl_ftry_reclaim_vnode, "I", 463 "Try to reclaim a vnode by its file descriptor"); 464 465 /* Shift count for (uintptr_t)vp to initialize vp->v_hash. */ 466 #define vnsz2log 8 467 #ifndef DEBUG_LOCKS 468 _Static_assert(sizeof(struct vnode) >= 1UL << vnsz2log && 469 sizeof(struct vnode) < 1UL << (vnsz2log + 1), 470 "vnsz2log needs to be updated"); 471 #endif 472 473 /* 474 * Support for the bufobj clean & dirty pctrie. 475 */ 476 static void * 477 buf_trie_alloc(struct pctrie *ptree) 478 { 479 return (uma_zalloc_smr(buf_trie_zone, M_NOWAIT)); 480 } 481 482 static void 483 buf_trie_free(struct pctrie *ptree, void *node) 484 { 485 uma_zfree_smr(buf_trie_zone, node); 486 } 487 PCTRIE_DEFINE_SMR(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free, 488 buf_trie_smr); 489 490 /* 491 * Initialize the vnode management data structures. 492 * 493 * Reevaluate the following cap on the number of vnodes after the physical 494 * memory size exceeds 512GB. In the limit, as the physical memory size 495 * grows, the ratio of the memory size in KB to vnodes approaches 64:1. 
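 *
 * As a worked example of where that cap comes from: 512GB expressed in KB is
 * 512 * 1024 * 1024 = 536870912, and dividing by 64 (KB per vnode) gives the
 * 8388608 (8M) vnode limit encoded in MAXVNODES_MAX below.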
496 */ 497 #ifndef MAXVNODES_MAX 498 #define MAXVNODES_MAX (512UL * 1024 * 1024 / 64) /* 8M */ 499 #endif 500 501 static MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker"); 502 503 static struct vnode * 504 vn_alloc_marker(struct mount *mp) 505 { 506 struct vnode *vp; 507 508 vp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO); 509 vp->v_type = VMARKER; 510 vp->v_mount = mp; 511 512 return (vp); 513 } 514 515 static void 516 vn_free_marker(struct vnode *vp) 517 { 518 519 MPASS(vp->v_type == VMARKER); 520 free(vp, M_VNODE_MARKER); 521 } 522 523 #ifdef KASAN 524 static int 525 vnode_ctor(void *mem, int size, void *arg __unused, int flags __unused) 526 { 527 kasan_mark(mem, size, roundup2(size, UMA_ALIGN_PTR + 1), 0); 528 return (0); 529 } 530 531 static void 532 vnode_dtor(void *mem, int size, void *arg __unused) 533 { 534 size_t end1, end2, off1, off2; 535 536 _Static_assert(offsetof(struct vnode, v_vnodelist) < 537 offsetof(struct vnode, v_dbatchcpu), 538 "KASAN marks require updating"); 539 540 off1 = offsetof(struct vnode, v_vnodelist); 541 off2 = offsetof(struct vnode, v_dbatchcpu); 542 end1 = off1 + sizeof(((struct vnode *)NULL)->v_vnodelist); 543 end2 = off2 + sizeof(((struct vnode *)NULL)->v_dbatchcpu); 544 545 /* 546 * Access to the v_vnodelist and v_dbatchcpu fields are permitted even 547 * after the vnode has been freed. Try to get some KASAN coverage by 548 * marking everything except those two fields as invalid. Because 549 * KASAN's tracking is not byte-granular, any preceding fields sharing 550 * the same 8-byte aligned word must also be marked valid. 551 */ 552 553 /* Handle the area from the start until v_vnodelist... */ 554 off1 = rounddown2(off1, KASAN_SHADOW_SCALE); 555 kasan_mark(mem, off1, off1, KASAN_UMA_FREED); 556 557 /* ... then the area between v_vnodelist and v_dbatchcpu ... */ 558 off1 = roundup2(end1, KASAN_SHADOW_SCALE); 559 off2 = rounddown2(off2, KASAN_SHADOW_SCALE); 560 if (off2 > off1) 561 kasan_mark((void *)((char *)mem + off1), off2 - off1, 562 off2 - off1, KASAN_UMA_FREED); 563 564 /* ... and finally the area from v_dbatchcpu to the end. */ 565 off2 = roundup2(end2, KASAN_SHADOW_SCALE); 566 kasan_mark((void *)((char *)mem + off2), size - off2, size - off2, 567 KASAN_UMA_FREED); 568 } 569 #endif /* KASAN */ 570 571 /* 572 * Initialize a vnode as it first enters the zone. 573 */ 574 static int 575 vnode_init(void *mem, int size, int flags) 576 { 577 struct vnode *vp; 578 579 vp = mem; 580 bzero(vp, size); 581 /* 582 * Setup locks. 583 */ 584 vp->v_vnlock = &vp->v_lock; 585 mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF); 586 /* 587 * By default, don't allow shared locks unless filesystems opt-in. 588 */ 589 lockinit(vp->v_vnlock, PVFS, "vnode", VLKTIMEOUT, 590 LK_NOSHARE | LK_IS_VNODE); 591 /* 592 * Initialize bufobj. 593 */ 594 bufobj_init(&vp->v_bufobj, vp); 595 /* 596 * Initialize namecache. 597 */ 598 cache_vnode_init(vp); 599 /* 600 * Initialize rangelocks. 601 */ 602 rangelock_init(&vp->v_rl); 603 604 vp->v_dbatchcpu = NOCPU; 605 606 /* 607 * Check vhold_recycle_free for an explanation. 608 */ 609 vp->v_holdcnt = VHOLD_NO_SMR; 610 vp->v_type = VNON; 611 mtx_lock(&vnode_list_mtx); 612 TAILQ_INSERT_BEFORE(vnode_list_free_marker, vp, v_vnodelist); 613 mtx_unlock(&vnode_list_mtx); 614 return (0); 615 } 616 617 /* 618 * Free a vnode when it is cleared from the zone. 
619 */ 620 static void 621 vnode_fini(void *mem, int size) 622 { 623 struct vnode *vp; 624 struct bufobj *bo; 625 626 vp = mem; 627 vdbatch_dequeue(vp); 628 mtx_lock(&vnode_list_mtx); 629 TAILQ_REMOVE(&vnode_list, vp, v_vnodelist); 630 mtx_unlock(&vnode_list_mtx); 631 rangelock_destroy(&vp->v_rl); 632 lockdestroy(vp->v_vnlock); 633 mtx_destroy(&vp->v_interlock); 634 bo = &vp->v_bufobj; 635 rw_destroy(BO_LOCKPTR(bo)); 636 637 kasan_mark(mem, size, size, 0); 638 } 639 640 /* 641 * Provide the size of NFS nclnode and NFS fh for calculation of the 642 * vnode memory consumption. The size is specified directly to 643 * eliminate dependency on NFS-private header. 644 * 645 * Other filesystems may use bigger or smaller (like UFS and ZFS) 646 * private inode data, but the NFS-based estimation is ample enough. 647 * Still, we care about differences in the size between 64- and 32-bit 648 * platforms. 649 * 650 * Namecache structure size is heuristically 651 * sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1. 652 */ 653 #ifdef _LP64 654 #define NFS_NCLNODE_SZ (528 + 64) 655 #define NC_SZ 148 656 #else 657 #define NFS_NCLNODE_SZ (360 + 32) 658 #define NC_SZ 92 659 #endif 660 661 static void 662 vntblinit(void *dummy __unused) 663 { 664 struct vdbatch *vd; 665 uma_ctor ctor; 666 uma_dtor dtor; 667 int cpu, physvnodes, virtvnodes; 668 669 /* 670 * Desiredvnodes is a function of the physical memory size and the 671 * kernel's heap size. Generally speaking, it scales with the 672 * physical memory size. The ratio of desiredvnodes to the physical 673 * memory size is 1:16 until desiredvnodes exceeds 98,304. 674 * Thereafter, the 675 * marginal ratio of desiredvnodes to the physical memory size is 676 * 1:64. However, desiredvnodes is limited by the kernel's heap 677 * size. The memory required by desiredvnodes vnodes and vm objects 678 * must not exceed 1/10th of the kernel's heap size. 679 */ 680 physvnodes = maxproc + pgtok(vm_cnt.v_page_count) / 64 + 681 3 * min(98304 * 16, pgtok(vm_cnt.v_page_count)) / 64; 682 virtvnodes = vm_kmem_size / (10 * (sizeof(struct vm_object) + 683 sizeof(struct vnode) + NC_SZ * ncsizefactor + NFS_NCLNODE_SZ)); 684 desiredvnodes = min(physvnodes, virtvnodes); 685 if (desiredvnodes > MAXVNODES_MAX) { 686 if (bootverbose) 687 printf("Reducing kern.maxvnodes %lu -> %lu\n", 688 desiredvnodes, MAXVNODES_MAX); 689 desiredvnodes = MAXVNODES_MAX; 690 } 691 wantfreevnodes = desiredvnodes / 4; 692 mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF); 693 TAILQ_INIT(&vnode_list); 694 mtx_init(&vnode_list_mtx, "vnode_list", NULL, MTX_DEF); 695 /* 696 * The lock is taken to appease WITNESS. 697 */ 698 mtx_lock(&vnode_list_mtx); 699 vnlru_recalc(); 700 mtx_unlock(&vnode_list_mtx); 701 vnode_list_free_marker = vn_alloc_marker(NULL); 702 TAILQ_INSERT_HEAD(&vnode_list, vnode_list_free_marker, v_vnodelist); 703 vnode_list_reclaim_marker = vn_alloc_marker(NULL); 704 TAILQ_INSERT_HEAD(&vnode_list, vnode_list_reclaim_marker, v_vnodelist); 705 706 #ifdef KASAN 707 ctor = vnode_ctor; 708 dtor = vnode_dtor; 709 #else 710 ctor = NULL; 711 dtor = NULL; 712 #endif 713 vnode_zone = uma_zcreate("VNODE", sizeof(struct vnode), ctor, dtor, 714 vnode_init, vnode_fini, UMA_ALIGN_PTR, UMA_ZONE_NOKASAN); 715 uma_zone_set_smr(vnode_zone, vfs_smr); 716 717 /* 718 * Preallocate enough nodes to support one-per buf so that 719 * we can not fail an insert. reassignbuf() callers can not 720 * tolerate the insertion failure. 
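 * (Inserting into a pctrie allocates at most one new internal node per
 * inserted leaf, which is why the one-node-per-buffer preallocation below
 * is expected to be sufficient to make insertions infallible.)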
 */
	buf_trie_zone = uma_zcreate("BUF TRIE", pctrie_node_size(),
	    NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR,
	    UMA_ZONE_NOFREE | UMA_ZONE_SMR);
	buf_trie_smr = uma_zone_get_smr(buf_trie_zone);
	uma_prealloc(buf_trie_zone, nbuf);

	vnodes_created = counter_u64_alloc(M_WAITOK);
	recycles_count = counter_u64_alloc(M_WAITOK);
	recycles_free_count = counter_u64_alloc(M_WAITOK);
	deferred_inact = counter_u64_alloc(M_WAITOK);

	/*
	 * Initialize the filesystem syncer.
	 */
	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
	    &syncer_mask);
	syncer_maxdelay = syncer_mask + 1;
	mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
	cv_init(&sync_wakeup, "syncer");

	CPU_FOREACH(cpu) {
		vd = DPCPU_ID_PTR((cpu), vd);
		bzero(vd, sizeof(*vd));
		mtx_init(&vd->lock, "vdbatch", NULL, MTX_DEF);
	}
}
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL);

/*
 * Mark a mount point as busy.  Used to synchronize access and to delay
 * unmounting.  Note that mountlist_mtx is not released on failure.
 *
 * vfs_busy() is a custom lock; it can block the caller.
 * vfs_busy() only sleeps if the unmount is active on the mount point.
 * For a mountpoint mp, the vfs_busy-enforced lock is ordered before the
 * lock of any vnode belonging to mp.
 *
 * Lookup uses vfs_busy() to traverse mount points.
 * root fs                      var fs
 * / vnode lock         A       / vnode lock (/var)             D
 * /var vnode lock      B       /log vnode lock(/var/log)       E
 * vfs_busy lock        C       vfs_busy lock                   F
 *
 * Within each file system, the lock order is C->A->B and F->D->E.
 *
 * When traversing across mounts, the system follows that lock order:
 *
 *        C->A->B
 *              |
 *              +->F->D->E
 *
 * The lookup() process for namei("/var") illustrates the process:
 *  1. VOP_LOOKUP() obtains B while A is held
 *  2. vfs_busy() obtains a shared lock on F while A and B are held
 *  3. vput() releases lock on B
 *  4. vput() releases lock on A
 *  5. VFS_ROOT() obtains lock on D while shared lock on F is held
 *  6. vfs_unbusy() releases shared lock on F
 *  7. vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A.
 *     Attempting to lock A (instead of vp_crossmp) while D is held would
 *     violate the global order, causing deadlocks.
 *
 * dounmount() locks B while F is drained.  Note that for stacked
 * filesystems, D and B in the example above may be the same lock,
 * which introduces a potential lock order reversal deadlock between
 * dounmount() and step 5 above.  These filesystems may avoid the LOR
 * by setting VV_CROSSLOCK on the covered vnode so that lock B will
 * remain held until after step 5.
 */
int
vfs_busy(struct mount *mp, int flags)
{
	struct mount_pcpu *mpcpu;

	MPASS((flags & ~MBF_MASK) == 0);
	CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags);

	if (vfs_op_thread_enter(mp, mpcpu)) {
		MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0);
		MPASS((mp->mnt_kern_flag & MNTK_UNMOUNT) == 0);
		MPASS((mp->mnt_kern_flag & MNTK_REFEXPIRE) == 0);
		vfs_mp_count_add_pcpu(mpcpu, ref, 1);
		vfs_mp_count_add_pcpu(mpcpu, lockref, 1);
		vfs_op_thread_exit(mp, mpcpu);
		if (flags & MBF_MNTLSTLOCK)
			mtx_unlock(&mountlist_mtx);
		return (0);
	}

	MNT_ILOCK(mp);
	vfs_assert_mount_counters(mp);
	MNT_REF(mp);
	/*
	 * If the mount point is currently being unmounted, sleep until the
	 * mount point fate is decided.
If thread doing the unmounting fails, 817 * it will clear MNTK_UNMOUNT flag before waking us up, indicating 818 * that this mount point has survived the unmount attempt and vfs_busy 819 * should retry. Otherwise the unmounter thread will set MNTK_REFEXPIRE 820 * flag in addition to MNTK_UNMOUNT, indicating that mount point is 821 * about to be really destroyed. vfs_busy needs to release its 822 * reference on the mount point in this case and return with ENOENT, 823 * telling the caller the mount it tried to busy is no longer valid. 824 */ 825 while (mp->mnt_kern_flag & MNTK_UNMOUNT) { 826 KASSERT(TAILQ_EMPTY(&mp->mnt_uppers), 827 ("%s: non-empty upper mount list with pending unmount", 828 __func__)); 829 if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) { 830 MNT_REL(mp); 831 MNT_IUNLOCK(mp); 832 CTR1(KTR_VFS, "%s: failed busying before sleeping", 833 __func__); 834 return (ENOENT); 835 } 836 if (flags & MBF_MNTLSTLOCK) 837 mtx_unlock(&mountlist_mtx); 838 mp->mnt_kern_flag |= MNTK_MWAIT; 839 msleep(mp, MNT_MTX(mp), PVFS | PDROP, "vfs_busy", 0); 840 if (flags & MBF_MNTLSTLOCK) 841 mtx_lock(&mountlist_mtx); 842 MNT_ILOCK(mp); 843 } 844 if (flags & MBF_MNTLSTLOCK) 845 mtx_unlock(&mountlist_mtx); 846 mp->mnt_lockref++; 847 MNT_IUNLOCK(mp); 848 return (0); 849 } 850 851 /* 852 * Free a busy filesystem. 853 */ 854 void 855 vfs_unbusy(struct mount *mp) 856 { 857 struct mount_pcpu *mpcpu; 858 int c; 859 860 CTR2(KTR_VFS, "%s: mp %p", __func__, mp); 861 862 if (vfs_op_thread_enter(mp, mpcpu)) { 863 MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0); 864 vfs_mp_count_sub_pcpu(mpcpu, lockref, 1); 865 vfs_mp_count_sub_pcpu(mpcpu, ref, 1); 866 vfs_op_thread_exit(mp, mpcpu); 867 return; 868 } 869 870 MNT_ILOCK(mp); 871 vfs_assert_mount_counters(mp); 872 MNT_REL(mp); 873 c = --mp->mnt_lockref; 874 if (mp->mnt_vfs_ops == 0) { 875 MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0); 876 MNT_IUNLOCK(mp); 877 return; 878 } 879 if (c < 0) 880 vfs_dump_mount_counters(mp); 881 if (c == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) { 882 MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT); 883 CTR1(KTR_VFS, "%s: waking up waiters", __func__); 884 mp->mnt_kern_flag &= ~MNTK_DRAINING; 885 wakeup(&mp->mnt_lockref); 886 } 887 MNT_IUNLOCK(mp); 888 } 889 890 /* 891 * Lookup a mount point by filesystem identifier. 892 */ 893 struct mount * 894 vfs_getvfs(fsid_t *fsid) 895 { 896 struct mount *mp; 897 898 CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid); 899 mtx_lock(&mountlist_mtx); 900 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 901 if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) { 902 vfs_ref(mp); 903 mtx_unlock(&mountlist_mtx); 904 return (mp); 905 } 906 } 907 mtx_unlock(&mountlist_mtx); 908 CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid); 909 return ((struct mount *) 0); 910 } 911 912 /* 913 * Lookup a mount point by filesystem identifier, busying it before 914 * returning. 915 * 916 * To avoid congestion on mountlist_mtx, implement simple direct-mapped 917 * cache for popular filesystem identifiers. The cache is lockess, using 918 * the fact that struct mount's are never freed. In worst case we may 919 * get pointer to unmounted or even different filesystem, so we have to 920 * check what we got, and go slow way if so. 
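 *
 * Roughly, the fast path implemented below is (sketch only):
 *
 *	mp = cache[hash];		// unlocked, possibly stale read
 *	if (mp == NULL || fsid mismatch || vfs_busy(mp, 0) != 0)
 *		goto slow;		// fall back to the locked mountlist walk
 *	if (fsid still matches)		// re-check now that mp cannot go away
 *		return (mp);
 *	vfs_unbusy(mp);
 *	goto slow;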
921 */ 922 struct mount * 923 vfs_busyfs(fsid_t *fsid) 924 { 925 #define FSID_CACHE_SIZE 256 926 typedef struct mount * volatile vmp_t; 927 static vmp_t cache[FSID_CACHE_SIZE]; 928 struct mount *mp; 929 int error; 930 uint32_t hash; 931 932 CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid); 933 hash = fsid->val[0] ^ fsid->val[1]; 934 hash = (hash >> 16 ^ hash) & (FSID_CACHE_SIZE - 1); 935 mp = cache[hash]; 936 if (mp == NULL || fsidcmp(&mp->mnt_stat.f_fsid, fsid) != 0) 937 goto slow; 938 if (vfs_busy(mp, 0) != 0) { 939 cache[hash] = NULL; 940 goto slow; 941 } 942 if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) 943 return (mp); 944 else 945 vfs_unbusy(mp); 946 947 slow: 948 mtx_lock(&mountlist_mtx); 949 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 950 if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) { 951 error = vfs_busy(mp, MBF_MNTLSTLOCK); 952 if (error) { 953 cache[hash] = NULL; 954 mtx_unlock(&mountlist_mtx); 955 return (NULL); 956 } 957 cache[hash] = mp; 958 return (mp); 959 } 960 } 961 CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid); 962 mtx_unlock(&mountlist_mtx); 963 return ((struct mount *) 0); 964 } 965 966 /* 967 * Check if a user can access privileged mount options. 968 */ 969 int 970 vfs_suser(struct mount *mp, struct thread *td) 971 { 972 int error; 973 974 if (jailed(td->td_ucred)) { 975 /* 976 * If the jail of the calling thread lacks permission for 977 * this type of file system, deny immediately. 978 */ 979 if (!prison_allow(td->td_ucred, mp->mnt_vfc->vfc_prison_flag)) 980 return (EPERM); 981 982 /* 983 * If the file system was mounted outside the jail of the 984 * calling thread, deny immediately. 985 */ 986 if (prison_check(td->td_ucred, mp->mnt_cred) != 0) 987 return (EPERM); 988 } 989 990 /* 991 * If file system supports delegated administration, we don't check 992 * for the PRIV_VFS_MOUNT_OWNER privilege - it will be better verified 993 * by the file system itself. 994 * If this is not the user that did original mount, we check for 995 * the PRIV_VFS_MOUNT_OWNER privilege. 996 */ 997 if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) && 998 mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) { 999 if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0) 1000 return (error); 1001 } 1002 return (0); 1003 } 1004 1005 /* 1006 * Get a new unique fsid. Try to make its val[0] unique, since this value 1007 * will be used to create fake device numbers for stat(). Also try (but 1008 * not so hard) make its val[0] unique mod 2^16, since some emulators only 1009 * support 16-bit device numbers. We end up with unique val[0]'s for the 1010 * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls. 1011 * 1012 * Keep in mind that several mounts may be running in parallel. Starting 1013 * the search one past where the previous search terminated is both a 1014 * micro-optimization and a defense against returning the same fsid to 1015 * different mounts. 
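 *
 * As a worked example of the encoding below (hypothetical inputs): with
 * vfc_typenum 0x35 and mntid_base 0x0102, the candidate fsid becomes
 *
 *	val[1] = 0x35
 *	val[0] = makedev(255, 0x35 << 24 | (0x0100 << 8) | 0x02)
 *	       = makedev(255, 0x35010002)
 *
 * and the loop keeps advancing mntid_base until vfs_getvfs() reports no
 * existing mount with that fsid.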
1016 */ 1017 void 1018 vfs_getnewfsid(struct mount *mp) 1019 { 1020 static uint16_t mntid_base; 1021 struct mount *nmp; 1022 fsid_t tfsid; 1023 int mtype; 1024 1025 CTR2(KTR_VFS, "%s: mp %p", __func__, mp); 1026 mtx_lock(&mntid_mtx); 1027 mtype = mp->mnt_vfc->vfc_typenum; 1028 tfsid.val[1] = mtype; 1029 mtype = (mtype & 0xFF) << 24; 1030 for (;;) { 1031 tfsid.val[0] = makedev(255, 1032 mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF)); 1033 mntid_base++; 1034 if ((nmp = vfs_getvfs(&tfsid)) == NULL) 1035 break; 1036 vfs_rel(nmp); 1037 } 1038 mp->mnt_stat.f_fsid.val[0] = tfsid.val[0]; 1039 mp->mnt_stat.f_fsid.val[1] = tfsid.val[1]; 1040 mtx_unlock(&mntid_mtx); 1041 } 1042 1043 /* 1044 * Knob to control the precision of file timestamps: 1045 * 1046 * 0 = seconds only; nanoseconds zeroed. 1047 * 1 = seconds and nanoseconds, accurate within 1/HZ. 1048 * 2 = seconds and nanoseconds, truncated to microseconds. 1049 * >=3 = seconds and nanoseconds, maximum precision. 1050 */ 1051 enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC }; 1052 1053 static int timestamp_precision = TSP_USEC; 1054 SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW, 1055 ×tamp_precision, 0, "File timestamp precision (0: seconds, " 1056 "1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to us, " 1057 "3+: sec + ns (max. precision))"); 1058 1059 /* 1060 * Get a current timestamp. 1061 */ 1062 void 1063 vfs_timestamp(struct timespec *tsp) 1064 { 1065 struct timeval tv; 1066 1067 switch (timestamp_precision) { 1068 case TSP_SEC: 1069 tsp->tv_sec = time_second; 1070 tsp->tv_nsec = 0; 1071 break; 1072 case TSP_HZ: 1073 getnanotime(tsp); 1074 break; 1075 case TSP_USEC: 1076 microtime(&tv); 1077 TIMEVAL_TO_TIMESPEC(&tv, tsp); 1078 break; 1079 case TSP_NSEC: 1080 default: 1081 nanotime(tsp); 1082 break; 1083 } 1084 } 1085 1086 /* 1087 * Set vnode attributes to VNOVAL 1088 */ 1089 void 1090 vattr_null(struct vattr *vap) 1091 { 1092 1093 vap->va_type = VNON; 1094 vap->va_size = VNOVAL; 1095 vap->va_bytes = VNOVAL; 1096 vap->va_mode = VNOVAL; 1097 vap->va_nlink = VNOVAL; 1098 vap->va_uid = VNOVAL; 1099 vap->va_gid = VNOVAL; 1100 vap->va_fsid = VNOVAL; 1101 vap->va_fileid = VNOVAL; 1102 vap->va_blocksize = VNOVAL; 1103 vap->va_rdev = VNOVAL; 1104 vap->va_atime.tv_sec = VNOVAL; 1105 vap->va_atime.tv_nsec = VNOVAL; 1106 vap->va_mtime.tv_sec = VNOVAL; 1107 vap->va_mtime.tv_nsec = VNOVAL; 1108 vap->va_ctime.tv_sec = VNOVAL; 1109 vap->va_ctime.tv_nsec = VNOVAL; 1110 vap->va_birthtime.tv_sec = VNOVAL; 1111 vap->va_birthtime.tv_nsec = VNOVAL; 1112 vap->va_flags = VNOVAL; 1113 vap->va_gen = VNOVAL; 1114 vap->va_vaflags = 0; 1115 } 1116 1117 /* 1118 * Try to reduce the total number of vnodes. 1119 * 1120 * This routine (and its user) are buggy in at least the following ways: 1121 * - all parameters were picked years ago when RAM sizes were significantly 1122 * smaller 1123 * - it can pick vnodes based on pages used by the vm object, but filesystems 1124 * like ZFS don't use it making the pick broken 1125 * - since ZFS has its own aging policy it gets partially combated by this one 1126 * - a dedicated method should be provided for filesystems to let them decide 1127 * whether the vnode should be recycled 1128 * 1129 * This routine is called when we have too many vnodes. It attempts 1130 * to free <count> vnodes and will potentially free vnodes that still 1131 * have VM backing store (VM backing store is typically the cause 1132 * of a vnode blowout so we want to do this). Therefore, this operation 1133 * is not considered cheap. 
 *
 * A number of conditions may prevent a vnode from being reclaimed.
 * The buffer cache may have references on the vnode, a directory
 * vnode may still have references due to the namei cache representing
 * underlying files, or the vnode may be in active use.  It is not
 * desirable to reuse such vnodes.  These conditions may cause the
 * number of vnodes to reach some minimum value regardless of what
 * you set kern.maxvnodes to.  Do not set kern.maxvnodes too low.
 *
 * @param reclaim_nc_src Only reclaim directories with outgoing namecache
 *                       entries if this argument is true
 * @param trigger        Only reclaim vnodes with fewer than this many resident
 *                       pages.
 * @param target         How many vnodes to reclaim.
 * @return               The number of vnodes that were reclaimed.
 */
static int
vlrureclaim(bool reclaim_nc_src, int trigger, u_long target)
{
	struct vnode *vp, *mvp;
	struct mount *mp;
	struct vm_object *object;
	u_long done;
	bool retried;

	mtx_assert(&vnode_list_mtx, MA_OWNED);

	retried = false;
	done = 0;

	mvp = vnode_list_reclaim_marker;
restart:
	vp = mvp;
	while (done < target) {
		vp = TAILQ_NEXT(vp, v_vnodelist);
		if (__predict_false(vp == NULL))
			break;

		if (__predict_false(vp->v_type == VMARKER))
			continue;

		/*
		 * If it's been deconstructed already, it's still
		 * referenced, or it exceeds the trigger, skip it.
		 * Also skip free vnodes.  We are trying to make space
		 * to expand the free list, not reduce it.
		 */
		if (vp->v_usecount > 0 || vp->v_holdcnt == 0 ||
		    (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)))
			goto next_iter;

		if (vp->v_type == VBAD || vp->v_type == VNON)
			goto next_iter;

		object = atomic_load_ptr(&vp->v_object);
		if (object == NULL || object->resident_page_count > trigger) {
			goto next_iter;
		}

		/*
		 * Handle races against vnode allocation.  Filesystems lock the
		 * vnode some time after it gets returned from getnewvnode,
		 * despite type and hold count being manipulated earlier.
		 * Resorting to checking v_mount restores guarantees present
		 * before the global list was reworked to contain all vnodes.
1199 */ 1200 if (!VI_TRYLOCK(vp)) 1201 goto next_iter; 1202 if (__predict_false(vp->v_type == VBAD || vp->v_type == VNON)) { 1203 VI_UNLOCK(vp); 1204 goto next_iter; 1205 } 1206 if (vp->v_mount == NULL) { 1207 VI_UNLOCK(vp); 1208 goto next_iter; 1209 } 1210 vholdl(vp); 1211 VI_UNLOCK(vp); 1212 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); 1213 TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist); 1214 mtx_unlock(&vnode_list_mtx); 1215 1216 if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { 1217 vdrop_recycle(vp); 1218 goto next_iter_unlocked; 1219 } 1220 if (VOP_LOCK(vp, LK_EXCLUSIVE|LK_NOWAIT) != 0) { 1221 vdrop_recycle(vp); 1222 vn_finished_write(mp); 1223 goto next_iter_unlocked; 1224 } 1225 1226 VI_LOCK(vp); 1227 if (vp->v_usecount > 0 || 1228 (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) || 1229 (vp->v_object != NULL && vp->v_object->handle == vp && 1230 vp->v_object->resident_page_count > trigger)) { 1231 VOP_UNLOCK(vp); 1232 vdropl_recycle(vp); 1233 vn_finished_write(mp); 1234 goto next_iter_unlocked; 1235 } 1236 counter_u64_add(recycles_count, 1); 1237 vgonel(vp); 1238 VOP_UNLOCK(vp); 1239 vdropl_recycle(vp); 1240 vn_finished_write(mp); 1241 done++; 1242 next_iter_unlocked: 1243 if (should_yield()) 1244 kern_yield(PRI_USER); 1245 mtx_lock(&vnode_list_mtx); 1246 goto restart; 1247 next_iter: 1248 MPASS(vp->v_type != VMARKER); 1249 if (!should_yield()) 1250 continue; 1251 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); 1252 TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist); 1253 mtx_unlock(&vnode_list_mtx); 1254 kern_yield(PRI_USER); 1255 mtx_lock(&vnode_list_mtx); 1256 goto restart; 1257 } 1258 if (done == 0 && !retried) { 1259 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); 1260 TAILQ_INSERT_HEAD(&vnode_list, mvp, v_vnodelist); 1261 retried = true; 1262 goto restart; 1263 } 1264 return (done); 1265 } 1266 1267 static int max_vnlru_free = 10000; /* limit on vnode free requests per call */ 1268 SYSCTL_INT(_debug, OID_AUTO, max_vnlru_free, CTLFLAG_RW, &max_vnlru_free, 1269 0, 1270 "limit on vnode free requests per call to the vnlru_free routine"); 1271 1272 /* 1273 * Attempt to reduce the free list by the requested amount. 1274 */ 1275 static int 1276 vnlru_free_impl(int count, struct vfsops *mnt_op, struct vnode *mvp) 1277 { 1278 struct vnode *vp; 1279 struct mount *mp; 1280 int ocount; 1281 1282 mtx_assert(&vnode_list_mtx, MA_OWNED); 1283 if (count > max_vnlru_free) 1284 count = max_vnlru_free; 1285 ocount = count; 1286 vp = mvp; 1287 for (;;) { 1288 if (count == 0) { 1289 break; 1290 } 1291 vp = TAILQ_NEXT(vp, v_vnodelist); 1292 if (__predict_false(vp == NULL)) { 1293 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); 1294 TAILQ_INSERT_TAIL(&vnode_list, mvp, v_vnodelist); 1295 break; 1296 } 1297 if (__predict_false(vp->v_type == VMARKER)) 1298 continue; 1299 if (vp->v_holdcnt > 0) 1300 continue; 1301 /* 1302 * Don't recycle if our vnode is from different type 1303 * of mount point. Note that mp is type-safe, the 1304 * check does not reach unmapped address even if 1305 * vnode is reclaimed. 
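 *
 * A filesystem that wants to trim only its own vnodes goes through the
 * filtered wrapper defined below; a minimal usage sketch (with a
 * placeholder "foo_vfsops") looks like:
 *
 *	mvp = vnlru_alloc_marker();
 *	...
 *	vnlru_free_vfsops(count, &foo_vfsops, mvp);
 *	...
 *	vnlru_free_marker(mvp);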
1306 */ 1307 if (mnt_op != NULL && (mp = vp->v_mount) != NULL && 1308 mp->mnt_op != mnt_op) { 1309 continue; 1310 } 1311 if (__predict_false(vp->v_type == VBAD || vp->v_type == VNON)) { 1312 continue; 1313 } 1314 if (!vhold_recycle_free(vp)) 1315 continue; 1316 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); 1317 TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist); 1318 mtx_unlock(&vnode_list_mtx); 1319 /* 1320 * FIXME: ignores the return value, meaning it may be nothing 1321 * got recycled but it claims otherwise to the caller. 1322 * 1323 * Originally the value started being ignored in 2005 with 1324 * 114a1006a8204aa156e1f9ad6476cdff89cada7f . 1325 * 1326 * Respecting the value can run into significant stalls if most 1327 * vnodes belong to one file system and it has writes 1328 * suspended. In presence of many threads and millions of 1329 * vnodes they keep contending on the vnode_list_mtx lock only 1330 * to find vnodes they can't recycle. 1331 * 1332 * The solution would be to pre-check if the vnode is likely to 1333 * be recycle-able, but it needs to happen with the 1334 * vnode_list_mtx lock held. This runs into a problem where 1335 * VOP_GETWRITEMOUNT (currently needed to find out about if 1336 * writes are frozen) can take locks which LOR against it. 1337 * 1338 * Check nullfs for one example (null_getwritemount). 1339 */ 1340 vtryrecycle(vp); 1341 count--; 1342 mtx_lock(&vnode_list_mtx); 1343 vp = mvp; 1344 } 1345 return (ocount - count); 1346 } 1347 1348 static int 1349 vnlru_free_locked(int count) 1350 { 1351 1352 mtx_assert(&vnode_list_mtx, MA_OWNED); 1353 return (vnlru_free_impl(count, NULL, vnode_list_free_marker)); 1354 } 1355 1356 void 1357 vnlru_free_vfsops(int count, struct vfsops *mnt_op, struct vnode *mvp) 1358 { 1359 1360 MPASS(mnt_op != NULL); 1361 MPASS(mvp != NULL); 1362 VNPASS(mvp->v_type == VMARKER, mvp); 1363 mtx_lock(&vnode_list_mtx); 1364 vnlru_free_impl(count, mnt_op, mvp); 1365 mtx_unlock(&vnode_list_mtx); 1366 } 1367 1368 struct vnode * 1369 vnlru_alloc_marker(void) 1370 { 1371 struct vnode *mvp; 1372 1373 mvp = vn_alloc_marker(NULL); 1374 mtx_lock(&vnode_list_mtx); 1375 TAILQ_INSERT_BEFORE(vnode_list_free_marker, mvp, v_vnodelist); 1376 mtx_unlock(&vnode_list_mtx); 1377 return (mvp); 1378 } 1379 1380 void 1381 vnlru_free_marker(struct vnode *mvp) 1382 { 1383 mtx_lock(&vnode_list_mtx); 1384 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); 1385 mtx_unlock(&vnode_list_mtx); 1386 vn_free_marker(mvp); 1387 } 1388 1389 static void 1390 vnlru_recalc(void) 1391 { 1392 1393 mtx_assert(&vnode_list_mtx, MA_OWNED); 1394 gapvnodes = imax(desiredvnodes - wantfreevnodes, 100); 1395 vhiwat = gapvnodes / 11; /* 9% -- just under the 10% in vlrureclaim() */ 1396 vlowat = vhiwat / 2; 1397 } 1398 1399 /* 1400 * Attempt to recycle vnodes in a context that is always safe to block. 1401 * Calling vlrurecycle() from the bowels of filesystem code has some 1402 * interesting deadlock problems. 1403 */ 1404 static struct proc *vnlruproc; 1405 static int vnlruproc_sig; 1406 1407 /* 1408 * The main freevnodes counter is only updated when threads requeue their vnode 1409 * batches. CPUs are conditionally walked to compute a more accurate total. 1410 * 1411 * Limit how much of a slop are we willing to tolerate. Note: the actual value 1412 * at any given moment can still exceed slop, but it should not be by significant 1413 * margin in practice. 
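 *
 * For example, with the slop of 128 defined below: if the global counter
 * last resynced at 1000 and has since drifted to 1100, the drift of 100 is
 * under the slop, so vnlru_read_freevnodes() returns the global value
 * instead of walking every CPU's batch.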
1414 */ 1415 #define VNLRU_FREEVNODES_SLOP 128 1416 1417 static __inline void 1418 vfs_freevnodes_inc(void) 1419 { 1420 struct vdbatch *vd; 1421 1422 critical_enter(); 1423 vd = DPCPU_PTR(vd); 1424 vd->freevnodes++; 1425 critical_exit(); 1426 } 1427 1428 static __inline void 1429 vfs_freevnodes_dec(void) 1430 { 1431 struct vdbatch *vd; 1432 1433 critical_enter(); 1434 vd = DPCPU_PTR(vd); 1435 vd->freevnodes--; 1436 critical_exit(); 1437 } 1438 1439 static u_long 1440 vnlru_read_freevnodes(void) 1441 { 1442 struct vdbatch *vd; 1443 long slop; 1444 int cpu; 1445 1446 mtx_assert(&vnode_list_mtx, MA_OWNED); 1447 if (freevnodes > freevnodes_old) 1448 slop = freevnodes - freevnodes_old; 1449 else 1450 slop = freevnodes_old - freevnodes; 1451 if (slop < VNLRU_FREEVNODES_SLOP) 1452 return (freevnodes >= 0 ? freevnodes : 0); 1453 freevnodes_old = freevnodes; 1454 CPU_FOREACH(cpu) { 1455 vd = DPCPU_ID_PTR((cpu), vd); 1456 freevnodes_old += vd->freevnodes; 1457 } 1458 return (freevnodes_old >= 0 ? freevnodes_old : 0); 1459 } 1460 1461 static bool 1462 vnlru_under(u_long rnumvnodes, u_long limit) 1463 { 1464 u_long rfreevnodes, space; 1465 1466 if (__predict_false(rnumvnodes > desiredvnodes)) 1467 return (true); 1468 1469 space = desiredvnodes - rnumvnodes; 1470 if (space < limit) { 1471 rfreevnodes = vnlru_read_freevnodes(); 1472 if (rfreevnodes > wantfreevnodes) 1473 space += rfreevnodes - wantfreevnodes; 1474 } 1475 return (space < limit); 1476 } 1477 1478 static bool 1479 vnlru_under_unlocked(u_long rnumvnodes, u_long limit) 1480 { 1481 long rfreevnodes, space; 1482 1483 if (__predict_false(rnumvnodes > desiredvnodes)) 1484 return (true); 1485 1486 space = desiredvnodes - rnumvnodes; 1487 if (space < limit) { 1488 rfreevnodes = atomic_load_long(&freevnodes); 1489 if (rfreevnodes > wantfreevnodes) 1490 space += rfreevnodes - wantfreevnodes; 1491 } 1492 return (space < limit); 1493 } 1494 1495 static void 1496 vnlru_kick(void) 1497 { 1498 1499 mtx_assert(&vnode_list_mtx, MA_OWNED); 1500 if (vnlruproc_sig == 0) { 1501 vnlruproc_sig = 1; 1502 wakeup(vnlruproc); 1503 } 1504 } 1505 1506 static void 1507 vnlru_proc(void) 1508 { 1509 u_long rnumvnodes, rfreevnodes, target; 1510 unsigned long onumvnodes; 1511 int done, force, trigger, usevnodes; 1512 bool reclaim_nc_src, want_reread; 1513 1514 EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, vnlruproc, 1515 SHUTDOWN_PRI_FIRST); 1516 1517 force = 0; 1518 want_reread = false; 1519 for (;;) { 1520 kproc_suspend_check(vnlruproc); 1521 mtx_lock(&vnode_list_mtx); 1522 rnumvnodes = atomic_load_long(&numvnodes); 1523 1524 if (want_reread) { 1525 force = vnlru_under(numvnodes, vhiwat) ? 1 : 0; 1526 want_reread = false; 1527 } 1528 1529 /* 1530 * If numvnodes is too large (due to desiredvnodes being 1531 * adjusted using its sysctl, or emergency growth), first 1532 * try to reduce it by discarding from the free list. 1533 */ 1534 if (rnumvnodes > desiredvnodes) { 1535 vnlru_free_locked(rnumvnodes - desiredvnodes); 1536 rnumvnodes = atomic_load_long(&numvnodes); 1537 } 1538 /* 1539 * Sleep if the vnode cache is in a good state. This is 1540 * when it is not over-full and has space for about a 4% 1541 * or 9% expansion (by growing its size or inexcessively 1542 * reducing its free list). Otherwise, try to reclaim 1543 * space for a 10% expansion. 
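 *
 * To put rough numbers on those percentages (hypothetical targets): with
 * desiredvnodes = 1000000 and the default wantfreevnodes = 250000,
 * vnlru_recalc() yields gapvnodes = 750000, vhiwat = 68181 (~9% of the
 * gap) and vlowat = 34090 (~4.5%).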
1544 */ 1545 if (vstir && force == 0) { 1546 force = 1; 1547 vstir = 0; 1548 } 1549 if (force == 0 && !vnlru_under(rnumvnodes, vlowat)) { 1550 vnlruproc_sig = 0; 1551 wakeup(&vnlruproc_sig); 1552 msleep(vnlruproc, &vnode_list_mtx, 1553 PVFS|PDROP, "vlruwt", hz); 1554 continue; 1555 } 1556 rfreevnodes = vnlru_read_freevnodes(); 1557 1558 onumvnodes = rnumvnodes; 1559 /* 1560 * Calculate parameters for recycling. These are the same 1561 * throughout the loop to give some semblance of fairness. 1562 * The trigger point is to avoid recycling vnodes with lots 1563 * of resident pages. We aren't trying to free memory; we 1564 * are trying to recycle or at least free vnodes. 1565 */ 1566 if (rnumvnodes <= desiredvnodes) 1567 usevnodes = rnumvnodes - rfreevnodes; 1568 else 1569 usevnodes = rnumvnodes; 1570 if (usevnodes <= 0) 1571 usevnodes = 1; 1572 /* 1573 * The trigger value is chosen to give a conservatively 1574 * large value to ensure that it alone doesn't prevent 1575 * making progress. The value can easily be so large that 1576 * it is effectively infinite in some congested and 1577 * misconfigured cases, and this is necessary. Normally 1578 * it is about 8 to 100 (pages), which is quite large. 1579 */ 1580 trigger = vm_cnt.v_page_count * 2 / usevnodes; 1581 if (force < 2) 1582 trigger = vsmalltrigger; 1583 reclaim_nc_src = force >= 3; 1584 target = rnumvnodes * (int64_t)gapvnodes / imax(desiredvnodes, 1); 1585 target = target / 10 + 1; 1586 done = vlrureclaim(reclaim_nc_src, trigger, target); 1587 mtx_unlock(&vnode_list_mtx); 1588 if (onumvnodes > desiredvnodes && numvnodes <= desiredvnodes) 1589 uma_reclaim(UMA_RECLAIM_DRAIN); 1590 if (done == 0) { 1591 if (force == 0 || force == 1) { 1592 force = 2; 1593 continue; 1594 } 1595 if (force == 2) { 1596 force = 3; 1597 continue; 1598 } 1599 want_reread = true; 1600 force = 0; 1601 vnlru_nowhere++; 1602 tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3); 1603 } else { 1604 want_reread = true; 1605 kern_yield(PRI_USER); 1606 } 1607 } 1608 } 1609 1610 static struct kproc_desc vnlru_kp = { 1611 "vnlru", 1612 vnlru_proc, 1613 &vnlruproc 1614 }; 1615 SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, 1616 &vnlru_kp); 1617 1618 /* 1619 * Routines having to do with the management of the vnode table. 1620 */ 1621 1622 /* 1623 * Try to recycle a freed vnode. We abort if anyone picks up a reference 1624 * before we actually vgone(). This function must be called with the vnode 1625 * held to prevent the vnode from being returned to the free list midway 1626 * through vgone(). 1627 */ 1628 static int 1629 vtryrecycle(struct vnode *vp) 1630 { 1631 struct mount *vnmp; 1632 1633 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 1634 VNASSERT(vp->v_holdcnt, vp, 1635 ("vtryrecycle: Recycling vp %p without a reference.", vp)); 1636 /* 1637 * This vnode may found and locked via some other list, if so we 1638 * can't recycle it yet. 1639 */ 1640 if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) { 1641 CTR2(KTR_VFS, 1642 "%s: impossible to recycle, vp %p lock is already held", 1643 __func__, vp); 1644 vdrop_recycle(vp); 1645 return (EWOULDBLOCK); 1646 } 1647 /* 1648 * Don't recycle if its filesystem is being suspended. 
1649 */ 1650 if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) { 1651 VOP_UNLOCK(vp); 1652 CTR2(KTR_VFS, 1653 "%s: impossible to recycle, cannot start the write for %p", 1654 __func__, vp); 1655 vdrop_recycle(vp); 1656 return (EBUSY); 1657 } 1658 /* 1659 * If we got this far, we need to acquire the interlock and see if 1660 * anyone picked up this vnode from another list. If not, we will 1661 * mark it with DOOMED via vgonel() so that anyone who does find it 1662 * will skip over it. 1663 */ 1664 VI_LOCK(vp); 1665 if (vp->v_usecount) { 1666 VOP_UNLOCK(vp); 1667 vdropl_recycle(vp); 1668 vn_finished_write(vnmp); 1669 CTR2(KTR_VFS, 1670 "%s: impossible to recycle, %p is already referenced", 1671 __func__, vp); 1672 return (EBUSY); 1673 } 1674 if (!VN_IS_DOOMED(vp)) { 1675 counter_u64_add(recycles_free_count, 1); 1676 vgonel(vp); 1677 } 1678 VOP_UNLOCK(vp); 1679 vdropl_recycle(vp); 1680 vn_finished_write(vnmp); 1681 return (0); 1682 } 1683 1684 /* 1685 * Allocate a new vnode. 1686 * 1687 * The operation never returns an error. Returning an error was disabled 1688 * in r145385 (dated 2005) with the following comment: 1689 * 1690 * XXX Not all VFS_VGET/ffs_vget callers check returns. 1691 * 1692 * Given the age of this commit (almost 15 years at the time of writing this 1693 * comment) restoring the ability to fail requires a significant audit of 1694 * all codepaths. 1695 * 1696 * The routine can try to free a vnode or stall for up to 1 second waiting for 1697 * vnlru to clear things up, but ultimately always performs a M_WAITOK allocation. 1698 */ 1699 static u_long vn_alloc_cyclecount; 1700 1701 static struct vnode * __noinline 1702 vn_alloc_hard(struct mount *mp) 1703 { 1704 u_long rnumvnodes, rfreevnodes; 1705 1706 mtx_lock(&vnode_list_mtx); 1707 rnumvnodes = atomic_load_long(&numvnodes); 1708 if (rnumvnodes + 1 < desiredvnodes) { 1709 vn_alloc_cyclecount = 0; 1710 goto alloc; 1711 } 1712 rfreevnodes = vnlru_read_freevnodes(); 1713 if (vn_alloc_cyclecount++ >= rfreevnodes) { 1714 vn_alloc_cyclecount = 0; 1715 vstir = 1; 1716 } 1717 /* 1718 * Grow the vnode cache if it will not be above its target max 1719 * after growing. Otherwise, if the free list is nonempty, try 1720 * to reclaim 1 item from it before growing the cache (possibly 1721 * above its target max if the reclamation failed or is delayed). 1722 * Otherwise, wait for some space. In all cases, schedule 1723 * vnlru_proc() if we are getting short of space. The watermarks 1724 * should be chosen so that we never wait or even reclaim from 1725 * the free list to below its target minimum. 1726 */ 1727 if (vnlru_free_locked(1) > 0) 1728 goto alloc; 1729 if (mp == NULL || (mp->mnt_kern_flag & MNTK_SUSPEND) == 0) { 1730 /* 1731 * Wait for space for a new vnode. 
 */
		vnlru_kick();
		msleep(&vnlruproc_sig, &vnode_list_mtx, PVFS, "vlruwk", hz);
		if (atomic_load_long(&numvnodes) + 1 > desiredvnodes &&
		    vnlru_read_freevnodes() > 1)
			vnlru_free_locked(1);
	}
alloc:
	rnumvnodes = atomic_fetchadd_long(&numvnodes, 1) + 1;
	if (vnlru_under(rnumvnodes, vlowat))
		vnlru_kick();
	mtx_unlock(&vnode_list_mtx);
	return (uma_zalloc_smr(vnode_zone, M_WAITOK));
}

static struct vnode *
vn_alloc(struct mount *mp)
{
	u_long rnumvnodes;

	if (__predict_false(vn_alloc_cyclecount != 0))
		return (vn_alloc_hard(mp));
	rnumvnodes = atomic_fetchadd_long(&numvnodes, 1) + 1;
	if (__predict_false(vnlru_under_unlocked(rnumvnodes, vlowat))) {
		atomic_subtract_long(&numvnodes, 1);
		return (vn_alloc_hard(mp));
	}

	return (uma_zalloc_smr(vnode_zone, M_WAITOK));
}

static void
vn_free(struct vnode *vp)
{

	atomic_subtract_long(&numvnodes, 1);
	uma_zfree_smr(vnode_zone, vp);
}

/*
 * Return the next vnode from the free list.
 */
int
getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops,
    struct vnode **vpp)
{
	struct vnode *vp;
	struct thread *td;
	struct lock_object *lo;

	CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag);

	KASSERT(vops->registered,
	    ("%s: not registered vector op %p\n", __func__, vops));

	td = curthread;
	if (td->td_vp_reserved != NULL) {
		vp = td->td_vp_reserved;
		td->td_vp_reserved = NULL;
	} else {
		vp = vn_alloc(mp);
	}
	counter_u64_add(vnodes_created, 1);
	/*
	 * Locks are given the generic name "vnode" when created.
	 * Follow the historic practice of using the filesystem
	 * name when they are allocated, e.g., "zfs", "ufs", "nfs", etc.
	 *
	 * Locks live in a witness group keyed on their name.  Thus,
	 * when a lock is renamed, it must also move from the witness
	 * group of its old name to the witness group of its new name.
	 *
	 * The change only needs to be made when the vnode moves
	 * from one filesystem type to another.  We ensure that each
	 * filesystem uses a single static name pointer for its tag so
	 * that we can compare pointers rather than doing a strcmp().
	 */
	lo = &vp->v_vnlock->lock_object;
#ifdef WITNESS
	if (lo->lo_name != tag) {
#endif
		lo->lo_name = tag;
#ifdef WITNESS
		WITNESS_DESTROY(lo);
		WITNESS_INIT(lo, tag);
	}
#endif
	/*
	 * By default, don't allow shared locks unless filesystems opt-in.
	 */
	vp->v_vnlock->lock_object.lo_flags |= LK_NOSHARE;
	/*
	 * Finalize various vnode identity bits.
1825 */ 1826 KASSERT(vp->v_object == NULL, ("stale v_object %p", vp)); 1827 KASSERT(vp->v_lockf == NULL, ("stale v_lockf %p", vp)); 1828 KASSERT(vp->v_pollinfo == NULL, ("stale v_pollinfo %p", vp)); 1829 vp->v_type = VNON; 1830 vp->v_op = vops; 1831 vp->v_irflag = 0; 1832 v_init_counters(vp); 1833 vn_seqc_init(vp); 1834 vp->v_bufobj.bo_ops = &buf_ops_bio; 1835 #ifdef DIAGNOSTIC 1836 if (mp == NULL && vops != &dead_vnodeops) 1837 printf("NULL mp in getnewvnode(9), tag %s\n", tag); 1838 #endif 1839 #ifdef MAC 1840 mac_vnode_init(vp); 1841 if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0) 1842 mac_vnode_associate_singlelabel(mp, vp); 1843 #endif 1844 if (mp != NULL) { 1845 vp->v_bufobj.bo_bsize = mp->mnt_stat.f_iosize; 1846 } 1847 1848 /* 1849 * For the filesystems which do not use vfs_hash_insert(), 1850 * still initialize v_hash to have vfs_hash_index() useful. 1851 * E.g., nullfs uses vfs_hash_index() on the lower vnode for 1852 * its own hashing. 1853 */ 1854 vp->v_hash = (uintptr_t)vp >> vnsz2log; 1855 1856 *vpp = vp; 1857 return (0); 1858 } 1859 1860 void 1861 getnewvnode_reserve(void) 1862 { 1863 struct thread *td; 1864 1865 td = curthread; 1866 MPASS(td->td_vp_reserved == NULL); 1867 td->td_vp_reserved = vn_alloc(NULL); 1868 } 1869 1870 void 1871 getnewvnode_drop_reserve(void) 1872 { 1873 struct thread *td; 1874 1875 td = curthread; 1876 if (td->td_vp_reserved != NULL) { 1877 vn_free(td->td_vp_reserved); 1878 td->td_vp_reserved = NULL; 1879 } 1880 } 1881 1882 static void __noinline 1883 freevnode(struct vnode *vp) 1884 { 1885 struct bufobj *bo; 1886 1887 /* 1888 * The vnode has been marked for destruction, so free it. 1889 * 1890 * The vnode will be returned to the zone where it will 1891 * normally remain until it is needed for another vnode. We 1892 * need to cleanup (or verify that the cleanup has already 1893 * been done) any residual data left from its current use 1894 * so as not to contaminate the freshly allocated vnode. 1895 */ 1896 CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp); 1897 /* 1898 * Paired with vgone. 1899 */ 1900 vn_seqc_write_end_free(vp); 1901 1902 bo = &vp->v_bufobj; 1903 VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't")); 1904 VNPASS(vp->v_holdcnt == VHOLD_NO_SMR, vp); 1905 VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count")); 1906 VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count")); 1907 VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's")); 1908 VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0")); 1909 VNASSERT(pctrie_is_empty(&bo->bo_clean.bv_root), vp, 1910 ("clean blk trie not empty")); 1911 VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0")); 1912 VNASSERT(pctrie_is_empty(&bo->bo_dirty.bv_root), vp, 1913 ("dirty blk trie not empty")); 1914 VNASSERT(TAILQ_EMPTY(&vp->v_rl.rl_waiters), vp, 1915 ("Dangling rangelock waiters")); 1916 VNASSERT((vp->v_iflag & (VI_DOINGINACT | VI_OWEINACT)) == 0, vp, 1917 ("Leaked inactivation")); 1918 VI_UNLOCK(vp); 1919 cache_assert_no_entries(vp); 1920 1921 #ifdef MAC 1922 mac_vnode_destroy(vp); 1923 #endif 1924 if (vp->v_pollinfo != NULL) { 1925 /* 1926 * Use LK_NOWAIT to shut up witness about the lock. We may get 1927 * here while having another vnode locked when trying to 1928 * satisfy a lookup and needing to recycle. 
1929 */ 1930 VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT); 1931 destroy_vpollinfo(vp->v_pollinfo); 1932 VOP_UNLOCK(vp); 1933 vp->v_pollinfo = NULL; 1934 } 1935 vp->v_mountedhere = NULL; 1936 vp->v_unpcb = NULL; 1937 vp->v_rdev = NULL; 1938 vp->v_fifoinfo = NULL; 1939 vp->v_iflag = 0; 1940 vp->v_vflag = 0; 1941 bo->bo_flag = 0; 1942 vn_free(vp); 1943 } 1944 1945 /* 1946 * Delete from old mount point vnode list, if on one. 1947 */ 1948 static void 1949 delmntque(struct vnode *vp) 1950 { 1951 struct mount *mp; 1952 1953 VNPASS((vp->v_mflag & VMP_LAZYLIST) == 0, vp); 1954 1955 mp = vp->v_mount; 1956 MNT_ILOCK(mp); 1957 VI_LOCK(vp); 1958 vp->v_mount = NULL; 1959 VNASSERT(mp->mnt_nvnodelistsize > 0, vp, 1960 ("bad mount point vnode list size")); 1961 TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); 1962 mp->mnt_nvnodelistsize--; 1963 MNT_REL(mp); 1964 MNT_IUNLOCK(mp); 1965 /* 1966 * The caller expects the interlock to be still held. 1967 */ 1968 ASSERT_VI_LOCKED(vp, __func__); 1969 } 1970 1971 static int 1972 insmntque1_int(struct vnode *vp, struct mount *mp, bool dtr) 1973 { 1974 1975 KASSERT(vp->v_mount == NULL, 1976 ("insmntque: vnode already on per mount vnode list")); 1977 VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)")); 1978 if ((mp->mnt_kern_flag & MNTK_UNLOCKED_INSMNTQUE) == 0) { 1979 ASSERT_VOP_ELOCKED(vp, "insmntque: non-locked vp"); 1980 } else { 1981 KASSERT(!dtr, 1982 ("%s: can't have MNTK_UNLOCKED_INSMNTQUE and cleanup", 1983 __func__)); 1984 } 1985 1986 /* 1987 * We acquire the vnode interlock early to ensure that the 1988 * vnode cannot be recycled by another process releasing a 1989 * holdcnt on it before we get it on both the vnode list 1990 * and the active vnode list. The mount mutex protects only 1991 * manipulation of the vnode list and the vnode freelist 1992 * mutex protects only manipulation of the active vnode list. 1993 * Hence the need to hold the vnode interlock throughout. 1994 */ 1995 MNT_ILOCK(mp); 1996 VI_LOCK(vp); 1997 if (((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0 && 1998 ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 || 1999 mp->mnt_nvnodelistsize == 0)) && 2000 (vp->v_vflag & VV_FORCEINSMQ) == 0) { 2001 VI_UNLOCK(vp); 2002 MNT_IUNLOCK(mp); 2003 if (dtr) { 2004 vp->v_data = NULL; 2005 vp->v_op = &dead_vnodeops; 2006 vgone(vp); 2007 vput(vp); 2008 } 2009 return (EBUSY); 2010 } 2011 vp->v_mount = mp; 2012 MNT_REF(mp); 2013 TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); 2014 VNASSERT(mp->mnt_nvnodelistsize >= 0, vp, 2015 ("neg mount point vnode list size")); 2016 mp->mnt_nvnodelistsize++; 2017 VI_UNLOCK(vp); 2018 MNT_IUNLOCK(mp); 2019 return (0); 2020 } 2021 2022 /* 2023 * Insert into list of vnodes for the new mount point, if available. 2024 * insmntque() reclaims the vnode on insertion failure, insmntque1() 2025 * leaves handling of the vnode to the caller. 2026 */ 2027 int 2028 insmntque(struct vnode *vp, struct mount *mp) 2029 { 2030 return (insmntque1_int(vp, mp, true)); 2031 } 2032 2033 int 2034 insmntque1(struct vnode *vp, struct mount *mp) 2035 { 2036 return (insmntque1_int(vp, mp, false)); 2037 } 2038 2039 /* 2040 * Flush out and invalidate all buffers associated with a bufobj 2041 * Called with the underlying object locked. 
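 *
 * V_SAVE asks for dirty buffers to be written out first; the call then
 * fails with EBUSY if dirty data or pending output remains afterwards.
 * See vgonel() below for the usual "try V_SAVE, then toss everything"
 * pattern used when that flush fails.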
2042 */ 2043 int 2044 bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo) 2045 { 2046 int error; 2047 2048 BO_LOCK(bo); 2049 if (flags & V_SAVE) { 2050 error = bufobj_wwait(bo, slpflag, slptimeo); 2051 if (error) { 2052 BO_UNLOCK(bo); 2053 return (error); 2054 } 2055 if (bo->bo_dirty.bv_cnt > 0) { 2056 BO_UNLOCK(bo); 2057 do { 2058 error = BO_SYNC(bo, MNT_WAIT); 2059 } while (error == ERELOOKUP); 2060 if (error != 0) 2061 return (error); 2062 BO_LOCK(bo); 2063 if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) { 2064 BO_UNLOCK(bo); 2065 return (EBUSY); 2066 } 2067 } 2068 } 2069 /* 2070 * If you alter this loop please notice that interlock is dropped and 2071 * reacquired in flushbuflist. Special care is needed to ensure that 2072 * no race conditions occur from this. 2073 */ 2074 do { 2075 error = flushbuflist(&bo->bo_clean, 2076 flags, bo, slpflag, slptimeo); 2077 if (error == 0 && !(flags & V_CLEANONLY)) 2078 error = flushbuflist(&bo->bo_dirty, 2079 flags, bo, slpflag, slptimeo); 2080 if (error != 0 && error != EAGAIN) { 2081 BO_UNLOCK(bo); 2082 return (error); 2083 } 2084 } while (error != 0); 2085 2086 /* 2087 * Wait for I/O to complete. XXX needs cleaning up. The vnode can 2088 * have write I/O in-progress but if there is a VM object then the 2089 * VM object can also have read-I/O in-progress. 2090 */ 2091 do { 2092 bufobj_wwait(bo, 0, 0); 2093 if ((flags & V_VMIO) == 0 && bo->bo_object != NULL) { 2094 BO_UNLOCK(bo); 2095 vm_object_pip_wait_unlocked(bo->bo_object, "bovlbx"); 2096 BO_LOCK(bo); 2097 } 2098 } while (bo->bo_numoutput > 0); 2099 BO_UNLOCK(bo); 2100 2101 /* 2102 * Destroy the copy in the VM cache, too. 2103 */ 2104 if (bo->bo_object != NULL && 2105 (flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0) { 2106 VM_OBJECT_WLOCK(bo->bo_object); 2107 vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ? 2108 OBJPR_CLEANONLY : 0); 2109 VM_OBJECT_WUNLOCK(bo->bo_object); 2110 } 2111 2112 #ifdef INVARIANTS 2113 BO_LOCK(bo); 2114 if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO | 2115 V_ALLOWCLEAN)) == 0 && (bo->bo_dirty.bv_cnt > 0 || 2116 bo->bo_clean.bv_cnt > 0)) 2117 panic("vinvalbuf: flush failed"); 2118 if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0 && 2119 bo->bo_dirty.bv_cnt > 0) 2120 panic("vinvalbuf: flush dirty failed"); 2121 BO_UNLOCK(bo); 2122 #endif 2123 return (0); 2124 } 2125 2126 /* 2127 * Flush out and invalidate all buffers associated with a vnode. 2128 * Called with the underlying object locked. 2129 */ 2130 int 2131 vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo) 2132 { 2133 2134 CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags); 2135 ASSERT_VOP_LOCKED(vp, "vinvalbuf"); 2136 if (vp->v_object != NULL && vp->v_object->handle != vp) 2137 return (0); 2138 return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo)); 2139 } 2140 2141 /* 2142 * Flush out buffers on the specified list. 2143 * 2144 */ 2145 static int 2146 flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag, 2147 int slptimeo) 2148 { 2149 struct buf *bp, *nbp; 2150 int retval, error; 2151 daddr_t lblkno; 2152 b_xflags_t xflags; 2153 2154 ASSERT_BO_WLOCKED(bo); 2155 2156 retval = 0; 2157 TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) { 2158 /* 2159 * If we are flushing both V_NORMAL and V_ALT buffers then 2160 * do not skip any buffers. If we are flushing only V_NORMAL 2161 * buffers then skip buffers marked as BX_ALTDATA. 
If we are 2162 * flushing only V_ALT buffers then skip buffers not marked 2163 * as BX_ALTDATA. 2164 */ 2165 if (((flags & (V_NORMAL | V_ALT)) != (V_NORMAL | V_ALT)) && 2166 (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA) != 0) || 2167 ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0))) { 2168 continue; 2169 } 2170 if (nbp != NULL) { 2171 lblkno = nbp->b_lblkno; 2172 xflags = nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN); 2173 } 2174 retval = EAGAIN; 2175 error = BUF_TIMELOCK(bp, 2176 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo), 2177 "flushbuf", slpflag, slptimeo); 2178 if (error) { 2179 BO_LOCK(bo); 2180 return (error != ENOLCK ? error : EAGAIN); 2181 } 2182 KASSERT(bp->b_bufobj == bo, 2183 ("bp %p wrong b_bufobj %p should be %p", 2184 bp, bp->b_bufobj, bo)); 2185 /* 2186 * XXX Since there are no node locks for NFS, I 2187 * believe there is a slight chance that a delayed 2188 * write will occur while sleeping just above, so 2189 * check for it. 2190 */ 2191 if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) && 2192 (flags & V_SAVE)) { 2193 bremfree(bp); 2194 bp->b_flags |= B_ASYNC; 2195 bwrite(bp); 2196 BO_LOCK(bo); 2197 return (EAGAIN); /* XXX: why not loop ? */ 2198 } 2199 bremfree(bp); 2200 bp->b_flags |= (B_INVAL | B_RELBUF); 2201 bp->b_flags &= ~B_ASYNC; 2202 brelse(bp); 2203 BO_LOCK(bo); 2204 if (nbp == NULL) 2205 break; 2206 nbp = gbincore(bo, lblkno); 2207 if (nbp == NULL || (nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) 2208 != xflags) 2209 break; /* nbp invalid */ 2210 } 2211 return (retval); 2212 } 2213 2214 int 2215 bnoreuselist(struct bufv *bufv, struct bufobj *bo, daddr_t startn, daddr_t endn) 2216 { 2217 struct buf *bp; 2218 int error; 2219 daddr_t lblkno; 2220 2221 ASSERT_BO_LOCKED(bo); 2222 2223 for (lblkno = startn;;) { 2224 again: 2225 bp = BUF_PCTRIE_LOOKUP_GE(&bufv->bv_root, lblkno); 2226 if (bp == NULL || bp->b_lblkno >= endn || 2227 bp->b_lblkno < startn) 2228 break; 2229 error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | 2230 LK_INTERLOCK, BO_LOCKPTR(bo), "brlsfl", 0, 0); 2231 if (error != 0) { 2232 BO_RLOCK(bo); 2233 if (error == ENOLCK) 2234 goto again; 2235 return (error); 2236 } 2237 KASSERT(bp->b_bufobj == bo, 2238 ("bp %p wrong b_bufobj %p should be %p", 2239 bp, bp->b_bufobj, bo)); 2240 lblkno = bp->b_lblkno + 1; 2241 if ((bp->b_flags & B_MANAGED) == 0) 2242 bremfree(bp); 2243 bp->b_flags |= B_RELBUF; 2244 /* 2245 * In the VMIO case, use the B_NOREUSE flag to hint that the 2246 * pages backing each buffer in the range are unlikely to be 2247 * reused. Dirty buffers will have the hint applied once 2248 * they've been written. 2249 */ 2250 if ((bp->b_flags & B_VMIO) != 0) 2251 bp->b_flags |= B_NOREUSE; 2252 brelse(bp); 2253 BO_RLOCK(bo); 2254 } 2255 return (0); 2256 } 2257 2258 /* 2259 * Truncate a file's buffer and pages to a specified length. This 2260 * is in lieu of the old vinvalbuf mechanism, which performed unneeded 2261 * sync activity. 2262 */ 2263 int 2264 vtruncbuf(struct vnode *vp, off_t length, int blksize) 2265 { 2266 struct buf *bp, *nbp; 2267 struct bufobj *bo; 2268 daddr_t startlbn; 2269 2270 CTR4(KTR_VFS, "%s: vp %p with block %d:%ju", __func__, 2271 vp, blksize, (uintmax_t)length); 2272 2273 /* 2274 * Round up to the *next* lbn. 
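 *
 * For example, with a 16384-byte block size, truncating to length 1
 * yields startlbn 1: logical block 0 is kept and everything from block
 * 1 on is invalidated, while truncating to length 0 invalidates all
 * buffers.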
2275 */ 2276 startlbn = howmany(length, blksize); 2277 2278 ASSERT_VOP_LOCKED(vp, "vtruncbuf"); 2279 2280 bo = &vp->v_bufobj; 2281 restart_unlocked: 2282 BO_LOCK(bo); 2283 2284 while (v_inval_buf_range_locked(vp, bo, startlbn, INT64_MAX) == EAGAIN) 2285 ; 2286 2287 if (length > 0) { 2288 restartsync: 2289 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { 2290 if (bp->b_lblkno > 0) 2291 continue; 2292 /* 2293 * Since we hold the vnode lock this should only 2294 * fail if we're racing with the buf daemon. 2295 */ 2296 if (BUF_LOCK(bp, 2297 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, 2298 BO_LOCKPTR(bo)) == ENOLCK) 2299 goto restart_unlocked; 2300 2301 VNASSERT((bp->b_flags & B_DELWRI), vp, 2302 ("buf(%p) on dirty queue without DELWRI", bp)); 2303 2304 bremfree(bp); 2305 bawrite(bp); 2306 BO_LOCK(bo); 2307 goto restartsync; 2308 } 2309 } 2310 2311 bufobj_wwait(bo, 0, 0); 2312 BO_UNLOCK(bo); 2313 vnode_pager_setsize(vp, length); 2314 2315 return (0); 2316 } 2317 2318 /* 2319 * Invalidate the cached pages of a file's buffer within the range of block 2320 * numbers [startlbn, endlbn). 2321 */ 2322 void 2323 v_inval_buf_range(struct vnode *vp, daddr_t startlbn, daddr_t endlbn, 2324 int blksize) 2325 { 2326 struct bufobj *bo; 2327 off_t start, end; 2328 2329 ASSERT_VOP_LOCKED(vp, "v_inval_buf_range"); 2330 2331 start = blksize * startlbn; 2332 end = blksize * endlbn; 2333 2334 bo = &vp->v_bufobj; 2335 BO_LOCK(bo); 2336 MPASS(blksize == bo->bo_bsize); 2337 2338 while (v_inval_buf_range_locked(vp, bo, startlbn, endlbn) == EAGAIN) 2339 ; 2340 2341 BO_UNLOCK(bo); 2342 vn_pages_remove(vp, OFF_TO_IDX(start), OFF_TO_IDX(end + PAGE_SIZE - 1)); 2343 } 2344 2345 static int 2346 v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo, 2347 daddr_t startlbn, daddr_t endlbn) 2348 { 2349 struct buf *bp, *nbp; 2350 bool anyfreed; 2351 2352 ASSERT_VOP_LOCKED(vp, "v_inval_buf_range_locked"); 2353 ASSERT_BO_LOCKED(bo); 2354 2355 do { 2356 anyfreed = false; 2357 TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) { 2358 if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn) 2359 continue; 2360 if (BUF_LOCK(bp, 2361 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, 2362 BO_LOCKPTR(bo)) == ENOLCK) { 2363 BO_LOCK(bo); 2364 return (EAGAIN); 2365 } 2366 2367 bremfree(bp); 2368 bp->b_flags |= B_INVAL | B_RELBUF; 2369 bp->b_flags &= ~B_ASYNC; 2370 brelse(bp); 2371 anyfreed = true; 2372 2373 BO_LOCK(bo); 2374 if (nbp != NULL && 2375 (((nbp->b_xflags & BX_VNCLEAN) == 0) || 2376 nbp->b_vp != vp || 2377 (nbp->b_flags & B_DELWRI) != 0)) 2378 return (EAGAIN); 2379 } 2380 2381 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { 2382 if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn) 2383 continue; 2384 if (BUF_LOCK(bp, 2385 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, 2386 BO_LOCKPTR(bo)) == ENOLCK) { 2387 BO_LOCK(bo); 2388 return (EAGAIN); 2389 } 2390 bremfree(bp); 2391 bp->b_flags |= B_INVAL | B_RELBUF; 2392 bp->b_flags &= ~B_ASYNC; 2393 brelse(bp); 2394 anyfreed = true; 2395 2396 BO_LOCK(bo); 2397 if (nbp != NULL && 2398 (((nbp->b_xflags & BX_VNDIRTY) == 0) || 2399 (nbp->b_vp != vp) || 2400 (nbp->b_flags & B_DELWRI) == 0)) 2401 return (EAGAIN); 2402 } 2403 } while (anyfreed); 2404 return (0); 2405 } 2406 2407 static void 2408 buf_vlist_remove(struct buf *bp) 2409 { 2410 struct bufv *bv; 2411 b_xflags_t flags; 2412 2413 flags = bp->b_xflags; 2414 2415 KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); 2416 ASSERT_BO_WLOCKED(bp->b_bufobj); 2417 KASSERT((flags & (BX_VNDIRTY | BX_VNCLEAN)) != 0 && 
2418 (flags & (BX_VNDIRTY | BX_VNCLEAN)) != (BX_VNDIRTY | BX_VNCLEAN), 2419 ("%s: buffer %p has invalid queue state", __func__, bp)); 2420 2421 if ((flags & BX_VNDIRTY) != 0) 2422 bv = &bp->b_bufobj->bo_dirty; 2423 else 2424 bv = &bp->b_bufobj->bo_clean; 2425 BUF_PCTRIE_REMOVE(&bv->bv_root, bp->b_lblkno); 2426 TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs); 2427 bv->bv_cnt--; 2428 bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN); 2429 } 2430 2431 /* 2432 * Add the buffer to the sorted clean or dirty block list. 2433 * 2434 * NOTE: xflags is passed as a constant, optimizing this inline function! 2435 */ 2436 static void 2437 buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags) 2438 { 2439 struct bufv *bv; 2440 struct buf *n; 2441 int error; 2442 2443 ASSERT_BO_WLOCKED(bo); 2444 KASSERT((bo->bo_flag & BO_NOBUFS) == 0, 2445 ("buf_vlist_add: bo %p does not allow bufs", bo)); 2446 KASSERT((xflags & BX_VNDIRTY) == 0 || (bo->bo_flag & BO_DEAD) == 0, 2447 ("dead bo %p", bo)); 2448 KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, 2449 ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags)); 2450 bp->b_xflags |= xflags; 2451 if (xflags & BX_VNDIRTY) 2452 bv = &bo->bo_dirty; 2453 else 2454 bv = &bo->bo_clean; 2455 2456 /* 2457 * Keep the list ordered. Optimize empty list insertion. Assume 2458 * we tend to grow at the tail so lookup_le should usually be cheaper 2459 * than _ge. 2460 */ 2461 if (bv->bv_cnt == 0 || 2462 bp->b_lblkno > TAILQ_LAST(&bv->bv_hd, buflists)->b_lblkno) 2463 TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs); 2464 else if ((n = BUF_PCTRIE_LOOKUP_LE(&bv->bv_root, bp->b_lblkno)) == NULL) 2465 TAILQ_INSERT_HEAD(&bv->bv_hd, bp, b_bobufs); 2466 else 2467 TAILQ_INSERT_AFTER(&bv->bv_hd, n, bp, b_bobufs); 2468 error = BUF_PCTRIE_INSERT(&bv->bv_root, bp); 2469 if (error) 2470 panic("buf_vlist_add: Preallocated nodes insufficient."); 2471 bv->bv_cnt++; 2472 } 2473 2474 /* 2475 * Look up a buffer using the buffer tries. 2476 */ 2477 struct buf * 2478 gbincore(struct bufobj *bo, daddr_t lblkno) 2479 { 2480 struct buf *bp; 2481 2482 ASSERT_BO_LOCKED(bo); 2483 bp = BUF_PCTRIE_LOOKUP(&bo->bo_clean.bv_root, lblkno); 2484 if (bp != NULL) 2485 return (bp); 2486 return (BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, lblkno)); 2487 } 2488 2489 /* 2490 * Look up a buf using the buffer tries, without the bufobj lock. This relies 2491 * on SMR for safe lookup, and bufs being in a no-free zone to provide type 2492 * stability of the result. Like other lockless lookups, the found buf may 2493 * already be invalid by the time this function returns. 2494 */ 2495 struct buf * 2496 gbincore_unlocked(struct bufobj *bo, daddr_t lblkno) 2497 { 2498 struct buf *bp; 2499 2500 ASSERT_BO_UNLOCKED(bo); 2501 bp = BUF_PCTRIE_LOOKUP_UNLOCKED(&bo->bo_clean.bv_root, lblkno); 2502 if (bp != NULL) 2503 return (bp); 2504 return (BUF_PCTRIE_LOOKUP_UNLOCKED(&bo->bo_dirty.bv_root, lblkno)); 2505 } 2506 2507 /* 2508 * Associate a buffer with a vnode. 2509 */ 2510 void 2511 bgetvp(struct vnode *vp, struct buf *bp) 2512 { 2513 struct bufobj *bo; 2514 2515 bo = &vp->v_bufobj; 2516 ASSERT_BO_WLOCKED(bo); 2517 VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free")); 2518 2519 CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags); 2520 VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp, 2521 ("bgetvp: bp already attached! %p", bp)); 2522 2523 vhold(vp); 2524 bp->b_vp = vp; 2525 bp->b_bufobj = bo; 2526 /* 2527 * Insert onto list for new vnode. 
2528 */ 2529 buf_vlist_add(bp, bo, BX_VNCLEAN); 2530 } 2531 2532 /* 2533 * Disassociate a buffer from a vnode. 2534 */ 2535 void 2536 brelvp(struct buf *bp) 2537 { 2538 struct bufobj *bo; 2539 struct vnode *vp; 2540 2541 CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); 2542 KASSERT(bp->b_vp != NULL, ("brelvp: NULL")); 2543 2544 /* 2545 * Delete from old vnode list, if on one. 2546 */ 2547 vp = bp->b_vp; /* XXX */ 2548 bo = bp->b_bufobj; 2549 BO_LOCK(bo); 2550 buf_vlist_remove(bp); 2551 if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { 2552 bo->bo_flag &= ~BO_ONWORKLST; 2553 mtx_lock(&sync_mtx); 2554 LIST_REMOVE(bo, bo_synclist); 2555 syncer_worklist_len--; 2556 mtx_unlock(&sync_mtx); 2557 } 2558 bp->b_vp = NULL; 2559 bp->b_bufobj = NULL; 2560 BO_UNLOCK(bo); 2561 vdrop(vp); 2562 } 2563 2564 /* 2565 * Add an item to the syncer work queue. 2566 */ 2567 static void 2568 vn_syncer_add_to_worklist(struct bufobj *bo, int delay) 2569 { 2570 int slot; 2571 2572 ASSERT_BO_WLOCKED(bo); 2573 2574 mtx_lock(&sync_mtx); 2575 if (bo->bo_flag & BO_ONWORKLST) 2576 LIST_REMOVE(bo, bo_synclist); 2577 else { 2578 bo->bo_flag |= BO_ONWORKLST; 2579 syncer_worklist_len++; 2580 } 2581 2582 if (delay > syncer_maxdelay - 2) 2583 delay = syncer_maxdelay - 2; 2584 slot = (syncer_delayno + delay) & syncer_mask; 2585 2586 LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist); 2587 mtx_unlock(&sync_mtx); 2588 } 2589 2590 static int 2591 sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS) 2592 { 2593 int error, len; 2594 2595 mtx_lock(&sync_mtx); 2596 len = syncer_worklist_len - sync_vnode_count; 2597 mtx_unlock(&sync_mtx); 2598 error = SYSCTL_OUT(req, &len, sizeof(len)); 2599 return (error); 2600 } 2601 2602 SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, 2603 CTLTYPE_INT | CTLFLAG_MPSAFE| CTLFLAG_RD, NULL, 0, 2604 sysctl_vfs_worklist_len, "I", "Syncer thread worklist length"); 2605 2606 static struct proc *updateproc; 2607 static void sched_sync(void); 2608 static struct kproc_desc up_kp = { 2609 "syncer", 2610 sched_sync, 2611 &updateproc 2612 }; 2613 SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp); 2614 2615 static int 2616 sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td) 2617 { 2618 struct vnode *vp; 2619 struct mount *mp; 2620 2621 *bo = LIST_FIRST(slp); 2622 if (*bo == NULL) 2623 return (0); 2624 vp = bo2vnode(*bo); 2625 if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0) 2626 return (1); 2627 /* 2628 * We use vhold in case the vnode does not 2629 * successfully sync. vhold prevents the vnode from 2630 * going away when we unlock the sync_mtx so that 2631 * we can acquire the vnode interlock. 2632 */ 2633 vholdl(vp); 2634 mtx_unlock(&sync_mtx); 2635 VI_UNLOCK(vp); 2636 if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { 2637 vdrop(vp); 2638 mtx_lock(&sync_mtx); 2639 return (*bo == LIST_FIRST(slp)); 2640 } 2641 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 2642 (void) VOP_FSYNC(vp, MNT_LAZY, td); 2643 VOP_UNLOCK(vp); 2644 vn_finished_write(mp); 2645 BO_LOCK(*bo); 2646 if (((*bo)->bo_flag & BO_ONWORKLST) != 0) { 2647 /* 2648 * Put us back on the worklist. The worklist 2649 * routine will remove us from our current 2650 * position and then add us back in at a later 2651 * position. 2652 */ 2653 vn_syncer_add_to_worklist(*bo, syncdelay); 2654 } 2655 BO_UNLOCK(*bo); 2656 vdrop(vp); 2657 mtx_lock(&sync_mtx); 2658 return (0); 2659 } 2660 2661 static int first_printf = 1; 2662 2663 /* 2664 * System filesystem synchronizer daemon. 
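 *
 * Each dirty bufobj is hashed into a slot of syncer_workitem_pending[]
 * based on when it is due and the loop below sweeps one slot per
 * second, so a bufobj queued via vn_syncer_add_to_worklist() waits
 * roughly "delay" seconds before being synced.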
2665 */ 2666 static void 2667 sched_sync(void) 2668 { 2669 struct synclist *next, *slp; 2670 struct bufobj *bo; 2671 long starttime; 2672 struct thread *td = curthread; 2673 int last_work_seen; 2674 int net_worklist_len; 2675 int syncer_final_iter; 2676 int error; 2677 2678 last_work_seen = 0; 2679 syncer_final_iter = 0; 2680 syncer_state = SYNCER_RUNNING; 2681 starttime = time_uptime; 2682 td->td_pflags |= TDP_NORUNNINGBUF; 2683 2684 EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc, 2685 SHUTDOWN_PRI_LAST); 2686 2687 mtx_lock(&sync_mtx); 2688 for (;;) { 2689 if (syncer_state == SYNCER_FINAL_DELAY && 2690 syncer_final_iter == 0) { 2691 mtx_unlock(&sync_mtx); 2692 kproc_suspend_check(td->td_proc); 2693 mtx_lock(&sync_mtx); 2694 } 2695 net_worklist_len = syncer_worklist_len - sync_vnode_count; 2696 if (syncer_state != SYNCER_RUNNING && 2697 starttime != time_uptime) { 2698 if (first_printf) { 2699 printf("\nSyncing disks, vnodes remaining... "); 2700 first_printf = 0; 2701 } 2702 printf("%d ", net_worklist_len); 2703 } 2704 starttime = time_uptime; 2705 2706 /* 2707 * Push files whose dirty time has expired. Be careful 2708 * of interrupt race on slp queue. 2709 * 2710 * Skip over empty worklist slots when shutting down. 2711 */ 2712 do { 2713 slp = &syncer_workitem_pending[syncer_delayno]; 2714 syncer_delayno += 1; 2715 if (syncer_delayno == syncer_maxdelay) 2716 syncer_delayno = 0; 2717 next = &syncer_workitem_pending[syncer_delayno]; 2718 /* 2719 * If the worklist has wrapped since the 2720 * it was emptied of all but syncer vnodes, 2721 * switch to the FINAL_DELAY state and run 2722 * for one more second. 2723 */ 2724 if (syncer_state == SYNCER_SHUTTING_DOWN && 2725 net_worklist_len == 0 && 2726 last_work_seen == syncer_delayno) { 2727 syncer_state = SYNCER_FINAL_DELAY; 2728 syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP; 2729 } 2730 } while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) && 2731 syncer_worklist_len > 0); 2732 2733 /* 2734 * Keep track of the last time there was anything 2735 * on the worklist other than syncer vnodes. 2736 * Return to the SHUTTING_DOWN state if any 2737 * new work appears. 2738 */ 2739 if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING) 2740 last_work_seen = syncer_delayno; 2741 if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY) 2742 syncer_state = SYNCER_SHUTTING_DOWN; 2743 while (!LIST_EMPTY(slp)) { 2744 error = sync_vnode(slp, &bo, td); 2745 if (error == 1) { 2746 LIST_REMOVE(bo, bo_synclist); 2747 LIST_INSERT_HEAD(next, bo, bo_synclist); 2748 continue; 2749 } 2750 2751 if (first_printf == 0) { 2752 /* 2753 * Drop the sync mutex, because some watchdog 2754 * drivers need to sleep while patting 2755 */ 2756 mtx_unlock(&sync_mtx); 2757 wdog_kern_pat(WD_LASTVAL); 2758 mtx_lock(&sync_mtx); 2759 } 2760 } 2761 if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0) 2762 syncer_final_iter--; 2763 /* 2764 * The variable rushjob allows the kernel to speed up the 2765 * processing of the filesystem syncer process. A rushjob 2766 * value of N tells the filesystem syncer to process the next 2767 * N seconds worth of work on its queue ASAP. Currently rushjob 2768 * is used by the soft update code to speed up the filesystem 2769 * syncer process when the incore state is getting so far 2770 * ahead of the disk that the kernel memory pool is being 2771 * threatened with exhaustion. 
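 *
 * In practice rushjob is bumped through speedup_syncer() below, which
 * is the interface the soft update code is expected to use.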
2772 */ 2773 if (rushjob > 0) { 2774 rushjob -= 1; 2775 continue; 2776 } 2777 /* 2778 * Just sleep for a short period of time between 2779 * iterations when shutting down to allow some I/O 2780 * to happen. 2781 * 2782 * If it has taken us less than a second to process the 2783 * current work, then wait. Otherwise start right over 2784 * again. We can still lose time if any single round 2785 * takes more than two seconds, but it does not really 2786 * matter as we are just trying to generally pace the 2787 * filesystem activity. 2788 */ 2789 if (syncer_state != SYNCER_RUNNING || 2790 time_uptime == starttime) { 2791 thread_lock(td); 2792 sched_prio(td, PPAUSE); 2793 thread_unlock(td); 2794 } 2795 if (syncer_state != SYNCER_RUNNING) 2796 cv_timedwait(&sync_wakeup, &sync_mtx, 2797 hz / SYNCER_SHUTDOWN_SPEEDUP); 2798 else if (time_uptime == starttime) 2799 cv_timedwait(&sync_wakeup, &sync_mtx, hz); 2800 } 2801 } 2802 2803 /* 2804 * Request the syncer daemon to speed up its work. 2805 * We never push it to speed up more than half of its 2806 * normal turn time, otherwise it could take over the cpu. 2807 */ 2808 int 2809 speedup_syncer(void) 2810 { 2811 int ret = 0; 2812 2813 mtx_lock(&sync_mtx); 2814 if (rushjob < syncdelay / 2) { 2815 rushjob += 1; 2816 stat_rush_requests += 1; 2817 ret = 1; 2818 } 2819 mtx_unlock(&sync_mtx); 2820 cv_broadcast(&sync_wakeup); 2821 return (ret); 2822 } 2823 2824 /* 2825 * Tell the syncer to speed up its work and run though its work 2826 * list several times, then tell it to shut down. 2827 */ 2828 static void 2829 syncer_shutdown(void *arg, int howto) 2830 { 2831 2832 if (howto & RB_NOSYNC) 2833 return; 2834 mtx_lock(&sync_mtx); 2835 syncer_state = SYNCER_SHUTTING_DOWN; 2836 rushjob = 0; 2837 mtx_unlock(&sync_mtx); 2838 cv_broadcast(&sync_wakeup); 2839 kproc_shutdown(arg, howto); 2840 } 2841 2842 void 2843 syncer_suspend(void) 2844 { 2845 2846 syncer_shutdown(updateproc, 0); 2847 } 2848 2849 void 2850 syncer_resume(void) 2851 { 2852 2853 mtx_lock(&sync_mtx); 2854 first_printf = 1; 2855 syncer_state = SYNCER_RUNNING; 2856 mtx_unlock(&sync_mtx); 2857 cv_broadcast(&sync_wakeup); 2858 kproc_resume(updateproc); 2859 } 2860 2861 /* 2862 * Move the buffer between the clean and dirty lists of its vnode. 2863 */ 2864 void 2865 reassignbuf(struct buf *bp) 2866 { 2867 struct vnode *vp; 2868 struct bufobj *bo; 2869 int delay; 2870 #ifdef INVARIANTS 2871 struct bufv *bv; 2872 #endif 2873 2874 vp = bp->b_vp; 2875 bo = bp->b_bufobj; 2876 2877 KASSERT((bp->b_flags & B_PAGING) == 0, 2878 ("%s: cannot reassign paging buffer %p", __func__, bp)); 2879 2880 CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X", 2881 bp, bp->b_vp, bp->b_flags); 2882 2883 BO_LOCK(bo); 2884 buf_vlist_remove(bp); 2885 2886 /* 2887 * If dirty, put on list of dirty buffers; otherwise insert onto list 2888 * of clean buffers. 
2889 */ 2890 if (bp->b_flags & B_DELWRI) { 2891 if ((bo->bo_flag & BO_ONWORKLST) == 0) { 2892 switch (vp->v_type) { 2893 case VDIR: 2894 delay = dirdelay; 2895 break; 2896 case VCHR: 2897 delay = metadelay; 2898 break; 2899 default: 2900 delay = filedelay; 2901 } 2902 vn_syncer_add_to_worklist(bo, delay); 2903 } 2904 buf_vlist_add(bp, bo, BX_VNDIRTY); 2905 } else { 2906 buf_vlist_add(bp, bo, BX_VNCLEAN); 2907 2908 if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { 2909 mtx_lock(&sync_mtx); 2910 LIST_REMOVE(bo, bo_synclist); 2911 syncer_worklist_len--; 2912 mtx_unlock(&sync_mtx); 2913 bo->bo_flag &= ~BO_ONWORKLST; 2914 } 2915 } 2916 #ifdef INVARIANTS 2917 bv = &bo->bo_clean; 2918 bp = TAILQ_FIRST(&bv->bv_hd); 2919 KASSERT(bp == NULL || bp->b_bufobj == bo, 2920 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 2921 bp = TAILQ_LAST(&bv->bv_hd, buflists); 2922 KASSERT(bp == NULL || bp->b_bufobj == bo, 2923 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 2924 bv = &bo->bo_dirty; 2925 bp = TAILQ_FIRST(&bv->bv_hd); 2926 KASSERT(bp == NULL || bp->b_bufobj == bo, 2927 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 2928 bp = TAILQ_LAST(&bv->bv_hd, buflists); 2929 KASSERT(bp == NULL || bp->b_bufobj == bo, 2930 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 2931 #endif 2932 BO_UNLOCK(bo); 2933 } 2934 2935 static void 2936 v_init_counters(struct vnode *vp) 2937 { 2938 2939 VNASSERT(vp->v_type == VNON && vp->v_data == NULL && vp->v_iflag == 0, 2940 vp, ("%s called for an initialized vnode", __FUNCTION__)); 2941 ASSERT_VI_UNLOCKED(vp, __FUNCTION__); 2942 2943 refcount_init(&vp->v_holdcnt, 1); 2944 refcount_init(&vp->v_usecount, 1); 2945 } 2946 2947 /* 2948 * Grab a particular vnode from the free list, increment its 2949 * reference count and lock it. VIRF_DOOMED is set if the vnode 2950 * is being destroyed. Only callers who specify LK_RETRY will 2951 * see doomed vnodes. If inactive processing was delayed in 2952 * vput try to do it here. 2953 * 2954 * usecount is manipulated using atomics without holding any locks. 2955 * 2956 * holdcnt can be manipulated using atomics without holding any locks, 2957 * except when transitioning 1<->0, in which case the interlock is held. 2958 * 2959 * Consumers which don't guarantee liveness of the vnode can use SMR to 2960 * try to get a reference. Note this operation can fail since the vnode 2961 * may be awaiting getting freed by the time they get to it. 
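 *
 * A minimal sketch of the expected pattern (the lockless lookup itself
 * is left to the caller):
 *
 *	vfs_smr_enter();
 *	vp = <lockless lookup>;
 *	vs = vget_prep_smr(vp);
 *	vfs_smr_exit();
 *	if (vs == VGET_NONE)
 *		<retry the lookup>;
 *	error = vget_finish(vp, LK_SHARED, vs);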
2962 */ 2963 enum vgetstate 2964 vget_prep_smr(struct vnode *vp) 2965 { 2966 enum vgetstate vs; 2967 2968 VFS_SMR_ASSERT_ENTERED(); 2969 2970 if (refcount_acquire_if_not_zero(&vp->v_usecount)) { 2971 vs = VGET_USECOUNT; 2972 } else { 2973 if (vhold_smr(vp)) 2974 vs = VGET_HOLDCNT; 2975 else 2976 vs = VGET_NONE; 2977 } 2978 return (vs); 2979 } 2980 2981 enum vgetstate 2982 vget_prep(struct vnode *vp) 2983 { 2984 enum vgetstate vs; 2985 2986 if (refcount_acquire_if_not_zero(&vp->v_usecount)) { 2987 vs = VGET_USECOUNT; 2988 } else { 2989 vhold(vp); 2990 vs = VGET_HOLDCNT; 2991 } 2992 return (vs); 2993 } 2994 2995 void 2996 vget_abort(struct vnode *vp, enum vgetstate vs) 2997 { 2998 2999 switch (vs) { 3000 case VGET_USECOUNT: 3001 vrele(vp); 3002 break; 3003 case VGET_HOLDCNT: 3004 vdrop(vp); 3005 break; 3006 default: 3007 __assert_unreachable(); 3008 } 3009 } 3010 3011 int 3012 vget(struct vnode *vp, int flags) 3013 { 3014 enum vgetstate vs; 3015 3016 vs = vget_prep(vp); 3017 return (vget_finish(vp, flags, vs)); 3018 } 3019 3020 int 3021 vget_finish(struct vnode *vp, int flags, enum vgetstate vs) 3022 { 3023 int error; 3024 3025 if ((flags & LK_INTERLOCK) != 0) 3026 ASSERT_VI_LOCKED(vp, __func__); 3027 else 3028 ASSERT_VI_UNLOCKED(vp, __func__); 3029 VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp); 3030 VNPASS(vp->v_holdcnt > 0, vp); 3031 VNPASS(vs == VGET_HOLDCNT || vp->v_usecount > 0, vp); 3032 3033 error = vn_lock(vp, flags); 3034 if (__predict_false(error != 0)) { 3035 vget_abort(vp, vs); 3036 CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__, 3037 vp); 3038 return (error); 3039 } 3040 3041 vget_finish_ref(vp, vs); 3042 return (0); 3043 } 3044 3045 void 3046 vget_finish_ref(struct vnode *vp, enum vgetstate vs) 3047 { 3048 int old; 3049 3050 VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp); 3051 VNPASS(vp->v_holdcnt > 0, vp); 3052 VNPASS(vs == VGET_HOLDCNT || vp->v_usecount > 0, vp); 3053 3054 if (vs == VGET_USECOUNT) 3055 return; 3056 3057 /* 3058 * We hold the vnode. If the usecount is 0 it will be utilized to keep 3059 * the vnode around. Otherwise someone else lended their hold count and 3060 * we have to drop ours. 3061 */ 3062 old = atomic_fetchadd_int(&vp->v_usecount, 1); 3063 VNASSERT(old >= 0, vp, ("%s: wrong use count %d", __func__, old)); 3064 if (old != 0) { 3065 #ifdef INVARIANTS 3066 old = atomic_fetchadd_int(&vp->v_holdcnt, -1); 3067 VNASSERT(old > 1, vp, ("%s: wrong hold count %d", __func__, old)); 3068 #else 3069 refcount_release(&vp->v_holdcnt); 3070 #endif 3071 } 3072 } 3073 3074 void 3075 vref(struct vnode *vp) 3076 { 3077 enum vgetstate vs; 3078 3079 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3080 vs = vget_prep(vp); 3081 vget_finish_ref(vp, vs); 3082 } 3083 3084 void 3085 vrefact(struct vnode *vp) 3086 { 3087 3088 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3089 #ifdef INVARIANTS 3090 int old = atomic_fetchadd_int(&vp->v_usecount, 1); 3091 VNASSERT(old > 0, vp, ("%s: wrong use count %d", __func__, old)); 3092 #else 3093 refcount_acquire(&vp->v_usecount); 3094 #endif 3095 } 3096 3097 void 3098 vlazy(struct vnode *vp) 3099 { 3100 struct mount *mp; 3101 3102 VNASSERT(vp->v_holdcnt > 0, vp, ("%s: vnode not held", __func__)); 3103 3104 if ((vp->v_mflag & VMP_LAZYLIST) != 0) 3105 return; 3106 /* 3107 * We may get here for inactive routines after the vnode got doomed. 
3108 */ 3109 if (VN_IS_DOOMED(vp)) 3110 return; 3111 mp = vp->v_mount; 3112 mtx_lock(&mp->mnt_listmtx); 3113 if ((vp->v_mflag & VMP_LAZYLIST) == 0) { 3114 vp->v_mflag |= VMP_LAZYLIST; 3115 TAILQ_INSERT_TAIL(&mp->mnt_lazyvnodelist, vp, v_lazylist); 3116 mp->mnt_lazyvnodelistsize++; 3117 } 3118 mtx_unlock(&mp->mnt_listmtx); 3119 } 3120 3121 static void 3122 vunlazy(struct vnode *vp) 3123 { 3124 struct mount *mp; 3125 3126 ASSERT_VI_LOCKED(vp, __func__); 3127 VNPASS(!VN_IS_DOOMED(vp), vp); 3128 3129 mp = vp->v_mount; 3130 mtx_lock(&mp->mnt_listmtx); 3131 VNPASS(vp->v_mflag & VMP_LAZYLIST, vp); 3132 /* 3133 * Don't remove the vnode from the lazy list if another thread 3134 * has increased the hold count. It may have re-enqueued the 3135 * vnode to the lazy list and is now responsible for its 3136 * removal. 3137 */ 3138 if (vp->v_holdcnt == 0) { 3139 vp->v_mflag &= ~VMP_LAZYLIST; 3140 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, vp, v_lazylist); 3141 mp->mnt_lazyvnodelistsize--; 3142 } 3143 mtx_unlock(&mp->mnt_listmtx); 3144 } 3145 3146 /* 3147 * This routine is only meant to be called from vgonel prior to dooming 3148 * the vnode. 3149 */ 3150 static void 3151 vunlazy_gone(struct vnode *vp) 3152 { 3153 struct mount *mp; 3154 3155 ASSERT_VOP_ELOCKED(vp, __func__); 3156 ASSERT_VI_LOCKED(vp, __func__); 3157 VNPASS(!VN_IS_DOOMED(vp), vp); 3158 3159 if (vp->v_mflag & VMP_LAZYLIST) { 3160 mp = vp->v_mount; 3161 mtx_lock(&mp->mnt_listmtx); 3162 VNPASS(vp->v_mflag & VMP_LAZYLIST, vp); 3163 vp->v_mflag &= ~VMP_LAZYLIST; 3164 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, vp, v_lazylist); 3165 mp->mnt_lazyvnodelistsize--; 3166 mtx_unlock(&mp->mnt_listmtx); 3167 } 3168 } 3169 3170 static void 3171 vdefer_inactive(struct vnode *vp) 3172 { 3173 3174 ASSERT_VI_LOCKED(vp, __func__); 3175 VNASSERT(vp->v_holdcnt > 0, vp, 3176 ("%s: vnode without hold count", __func__)); 3177 if (VN_IS_DOOMED(vp)) { 3178 vdropl(vp); 3179 return; 3180 } 3181 if (vp->v_iflag & VI_DEFINACT) { 3182 VNASSERT(vp->v_holdcnt > 1, vp, ("lost hold count")); 3183 vdropl(vp); 3184 return; 3185 } 3186 if (vp->v_usecount > 0) { 3187 vp->v_iflag &= ~VI_OWEINACT; 3188 vdropl(vp); 3189 return; 3190 } 3191 vlazy(vp); 3192 vp->v_iflag |= VI_DEFINACT; 3193 VI_UNLOCK(vp); 3194 counter_u64_add(deferred_inact, 1); 3195 } 3196 3197 static void 3198 vdefer_inactive_unlocked(struct vnode *vp) 3199 { 3200 3201 VI_LOCK(vp); 3202 if ((vp->v_iflag & VI_OWEINACT) == 0) { 3203 vdropl(vp); 3204 return; 3205 } 3206 vdefer_inactive(vp); 3207 } 3208 3209 enum vput_op { VRELE, VPUT, VUNREF }; 3210 3211 /* 3212 * Handle ->v_usecount transitioning to 0. 3213 * 3214 * By releasing the last usecount we take ownership of the hold count which 3215 * provides liveness of the vnode, meaning we have to vdrop. 3216 * 3217 * For all vnodes we may need to perform inactive processing. It requires an 3218 * exclusive lock on the vnode, while it is legal to call here with only a 3219 * shared lock (or no locks). If locking the vnode in an expected manner fails, 3220 * inactive processing gets deferred to the syncer. 3221 * 3222 * XXX Some filesystems pass in an exclusively locked vnode and strongly depend 3223 * on the lock being held all the way until VOP_INACTIVE. This in particular 3224 * happens with UFS which adds half-constructed vnodes to the hash, where they 3225 * can be found by other code. 
3226 */ 3227 static void 3228 vput_final(struct vnode *vp, enum vput_op func) 3229 { 3230 int error; 3231 bool want_unlock; 3232 3233 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3234 VNPASS(vp->v_holdcnt > 0, vp); 3235 3236 VI_LOCK(vp); 3237 3238 /* 3239 * By the time we got here someone else might have transitioned 3240 * the count back to > 0. 3241 */ 3242 if (vp->v_usecount > 0) 3243 goto out; 3244 3245 /* 3246 * If the vnode is doomed vgone already performed inactive processing 3247 * (if needed). 3248 */ 3249 if (VN_IS_DOOMED(vp)) 3250 goto out; 3251 3252 if (__predict_true(VOP_NEED_INACTIVE(vp) == 0)) 3253 goto out; 3254 3255 if (vp->v_iflag & VI_DOINGINACT) 3256 goto out; 3257 3258 /* 3259 * Locking operations here will drop the interlock and possibly the 3260 * vnode lock, opening a window where the vnode can get doomed all the 3261 * while ->v_usecount is 0. Set VI_OWEINACT to let vgone know to 3262 * perform inactive. 3263 */ 3264 vp->v_iflag |= VI_OWEINACT; 3265 want_unlock = false; 3266 error = 0; 3267 switch (func) { 3268 case VRELE: 3269 switch (VOP_ISLOCKED(vp)) { 3270 case LK_EXCLUSIVE: 3271 break; 3272 case LK_EXCLOTHER: 3273 case 0: 3274 want_unlock = true; 3275 error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK); 3276 VI_LOCK(vp); 3277 break; 3278 default: 3279 /* 3280 * The lock has at least one sharer, but we have no way 3281 * to conclude whether this is us. Play it safe and 3282 * defer processing. 3283 */ 3284 error = EAGAIN; 3285 break; 3286 } 3287 break; 3288 case VPUT: 3289 want_unlock = true; 3290 if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { 3291 error = VOP_LOCK(vp, LK_UPGRADE | LK_INTERLOCK | 3292 LK_NOWAIT); 3293 VI_LOCK(vp); 3294 } 3295 break; 3296 case VUNREF: 3297 if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { 3298 error = VOP_LOCK(vp, LK_TRYUPGRADE | LK_INTERLOCK); 3299 VI_LOCK(vp); 3300 } 3301 break; 3302 } 3303 if (error == 0) { 3304 if (func == VUNREF) { 3305 VNASSERT((vp->v_vflag & VV_UNREF) == 0, vp, 3306 ("recursive vunref")); 3307 vp->v_vflag |= VV_UNREF; 3308 } 3309 for (;;) { 3310 error = vinactive(vp); 3311 if (want_unlock) 3312 VOP_UNLOCK(vp); 3313 if (error != ERELOOKUP || !want_unlock) 3314 break; 3315 VOP_LOCK(vp, LK_EXCLUSIVE); 3316 } 3317 if (func == VUNREF) 3318 vp->v_vflag &= ~VV_UNREF; 3319 vdropl(vp); 3320 } else { 3321 vdefer_inactive(vp); 3322 } 3323 return; 3324 out: 3325 if (func == VPUT) 3326 VOP_UNLOCK(vp); 3327 vdropl(vp); 3328 } 3329 3330 /* 3331 * Decrement ->v_usecount for a vnode. 3332 * 3333 * Releasing the last use count requires additional processing, see vput_final 3334 * above for details. 3335 * 3336 * Comment above each variant denotes lock state on entry and exit. 
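 *
 * For example, a caller which locked the vnode and is done with it
 * typically uses vput() (which also unlocks), a caller which never
 * locked it uses vrele(), and a caller which wants to keep the lock
 * uses vunref().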
3337 */ 3338 3339 /* 3340 * in: any 3341 * out: same as passed in 3342 */ 3343 void 3344 vrele(struct vnode *vp) 3345 { 3346 3347 ASSERT_VI_UNLOCKED(vp, __func__); 3348 if (!refcount_release(&vp->v_usecount)) 3349 return; 3350 vput_final(vp, VRELE); 3351 } 3352 3353 /* 3354 * in: locked 3355 * out: unlocked 3356 */ 3357 void 3358 vput(struct vnode *vp) 3359 { 3360 3361 ASSERT_VOP_LOCKED(vp, __func__); 3362 ASSERT_VI_UNLOCKED(vp, __func__); 3363 if (!refcount_release(&vp->v_usecount)) { 3364 VOP_UNLOCK(vp); 3365 return; 3366 } 3367 vput_final(vp, VPUT); 3368 } 3369 3370 /* 3371 * in: locked 3372 * out: locked 3373 */ 3374 void 3375 vunref(struct vnode *vp) 3376 { 3377 3378 ASSERT_VOP_LOCKED(vp, __func__); 3379 ASSERT_VI_UNLOCKED(vp, __func__); 3380 if (!refcount_release(&vp->v_usecount)) 3381 return; 3382 vput_final(vp, VUNREF); 3383 } 3384 3385 void 3386 vhold(struct vnode *vp) 3387 { 3388 int old; 3389 3390 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3391 old = atomic_fetchadd_int(&vp->v_holdcnt, 1); 3392 VNASSERT(old >= 0 && (old & VHOLD_ALL_FLAGS) == 0, vp, 3393 ("%s: wrong hold count %d", __func__, old)); 3394 if (old == 0) 3395 vfs_freevnodes_dec(); 3396 } 3397 3398 void 3399 vholdnz(struct vnode *vp) 3400 { 3401 3402 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3403 #ifdef INVARIANTS 3404 int old = atomic_fetchadd_int(&vp->v_holdcnt, 1); 3405 VNASSERT(old > 0 && (old & VHOLD_ALL_FLAGS) == 0, vp, 3406 ("%s: wrong hold count %d", __func__, old)); 3407 #else 3408 atomic_add_int(&vp->v_holdcnt, 1); 3409 #endif 3410 } 3411 3412 /* 3413 * Grab a hold count unless the vnode is freed. 3414 * 3415 * Only use this routine if vfs smr is the only protection you have against 3416 * freeing the vnode. 3417 * 3418 * The code loops trying to add a hold count as long as the VHOLD_NO_SMR flag 3419 * is not set. After the flag is set the vnode becomes immutable to anyone but 3420 * the thread which managed to set the flag. 3421 * 3422 * It may be tempting to replace the loop with: 3423 * count = atomic_fetchadd_int(&vp->v_holdcnt, 1); 3424 * if (count & VHOLD_NO_SMR) { 3425 * backpedal and error out; 3426 * } 3427 * 3428 * However, while this is more performant, it hinders debugging by eliminating 3429 * the previously mentioned invariant. 3430 */ 3431 bool 3432 vhold_smr(struct vnode *vp) 3433 { 3434 int count; 3435 3436 VFS_SMR_ASSERT_ENTERED(); 3437 3438 count = atomic_load_int(&vp->v_holdcnt); 3439 for (;;) { 3440 if (count & VHOLD_NO_SMR) { 3441 VNASSERT((count & ~VHOLD_NO_SMR) == 0, vp, 3442 ("non-zero hold count with flags %d\n", count)); 3443 return (false); 3444 } 3445 VNASSERT(count >= 0, vp, ("invalid hold count %d\n", count)); 3446 if (atomic_fcmpset_int(&vp->v_holdcnt, &count, count + 1)) { 3447 if (count == 0) 3448 vfs_freevnodes_dec(); 3449 return (true); 3450 } 3451 } 3452 } 3453 3454 /* 3455 * Hold a free vnode for recycling. 3456 * 3457 * Note: vnode_init references this comment. 3458 * 3459 * Attempts to recycle only need the global vnode list lock and have no use for 3460 * SMR. 3461 * 3462 * However, vnodes get inserted into the global list before they get fully 3463 * initialized and stay there until UMA decides to free the memory. This in 3464 * particular means the target can be found before it becomes usable and after 3465 * it becomes recycled. Picking up such vnodes is guarded with v_holdcnt set to 3466 * VHOLD_NO_SMR. 3467 * 3468 * Note: the vnode may gain more references after we transition the count 0->1. 
3469 */ 3470 static bool 3471 vhold_recycle_free(struct vnode *vp) 3472 { 3473 int count; 3474 3475 mtx_assert(&vnode_list_mtx, MA_OWNED); 3476 3477 count = atomic_load_int(&vp->v_holdcnt); 3478 for (;;) { 3479 if (count & VHOLD_NO_SMR) { 3480 VNASSERT((count & ~VHOLD_NO_SMR) == 0, vp, 3481 ("non-zero hold count with flags %d\n", count)); 3482 return (false); 3483 } 3484 VNASSERT(count >= 0, vp, ("invalid hold count %d\n", count)); 3485 if (count > 0) { 3486 return (false); 3487 } 3488 if (atomic_fcmpset_int(&vp->v_holdcnt, &count, count + 1)) { 3489 vfs_freevnodes_dec(); 3490 return (true); 3491 } 3492 } 3493 } 3494 3495 static void __noinline 3496 vdbatch_process(struct vdbatch *vd) 3497 { 3498 struct vnode *vp; 3499 int i; 3500 3501 mtx_assert(&vd->lock, MA_OWNED); 3502 MPASS(curthread->td_pinned > 0); 3503 MPASS(vd->index == VDBATCH_SIZE); 3504 3505 mtx_lock(&vnode_list_mtx); 3506 critical_enter(); 3507 freevnodes += vd->freevnodes; 3508 for (i = 0; i < VDBATCH_SIZE; i++) { 3509 vp = vd->tab[i]; 3510 TAILQ_REMOVE(&vnode_list, vp, v_vnodelist); 3511 TAILQ_INSERT_TAIL(&vnode_list, vp, v_vnodelist); 3512 MPASS(vp->v_dbatchcpu != NOCPU); 3513 vp->v_dbatchcpu = NOCPU; 3514 } 3515 mtx_unlock(&vnode_list_mtx); 3516 vd->freevnodes = 0; 3517 bzero(vd->tab, sizeof(vd->tab)); 3518 vd->index = 0; 3519 critical_exit(); 3520 } 3521 3522 static void 3523 vdbatch_enqueue(struct vnode *vp) 3524 { 3525 struct vdbatch *vd; 3526 3527 ASSERT_VI_LOCKED(vp, __func__); 3528 VNASSERT(!VN_IS_DOOMED(vp), vp, 3529 ("%s: deferring requeue of a doomed vnode", __func__)); 3530 3531 if (vp->v_dbatchcpu != NOCPU) { 3532 VI_UNLOCK(vp); 3533 return; 3534 } 3535 3536 sched_pin(); 3537 vd = DPCPU_PTR(vd); 3538 mtx_lock(&vd->lock); 3539 MPASS(vd->index < VDBATCH_SIZE); 3540 MPASS(vd->tab[vd->index] == NULL); 3541 /* 3542 * A hack: we depend on being pinned so that we know what to put in 3543 * ->v_dbatchcpu. 3544 */ 3545 vp->v_dbatchcpu = curcpu; 3546 vd->tab[vd->index] = vp; 3547 vd->index++; 3548 VI_UNLOCK(vp); 3549 if (vd->index == VDBATCH_SIZE) 3550 vdbatch_process(vd); 3551 mtx_unlock(&vd->lock); 3552 sched_unpin(); 3553 } 3554 3555 /* 3556 * This routine must only be called for vnodes which are about to be 3557 * deallocated. Supporting dequeue for arbitrary vnodes would require 3558 * validating that the locked batch matches. 3559 */ 3560 static void 3561 vdbatch_dequeue(struct vnode *vp) 3562 { 3563 struct vdbatch *vd; 3564 int i; 3565 short cpu; 3566 3567 VNASSERT(vp->v_type == VBAD || vp->v_type == VNON, vp, 3568 ("%s: called for a used vnode\n", __func__)); 3569 3570 cpu = vp->v_dbatchcpu; 3571 if (cpu == NOCPU) 3572 return; 3573 3574 vd = DPCPU_ID_PTR(cpu, vd); 3575 mtx_lock(&vd->lock); 3576 for (i = 0; i < vd->index; i++) { 3577 if (vd->tab[i] != vp) 3578 continue; 3579 vp->v_dbatchcpu = NOCPU; 3580 vd->index--; 3581 vd->tab[i] = vd->tab[vd->index]; 3582 vd->tab[vd->index] = NULL; 3583 break; 3584 } 3585 mtx_unlock(&vd->lock); 3586 /* 3587 * Either we dequeued the vnode above or the target CPU beat us to it. 3588 */ 3589 MPASS(vp->v_dbatchcpu == NOCPU); 3590 } 3591 3592 /* 3593 * Drop the hold count of the vnode. If this is the last reference to 3594 * the vnode we place it on the free list unless it has been vgone'd 3595 * (marked VIRF_DOOMED) in which case we will free it. 3596 * 3597 * Because the vnode vm object keeps a hold reference on the vnode if 3598 * there is at least one resident non-cached page, the vnode cannot 3599 * leave the active list without the page cleanup done.
3600 */ 3601 static void __noinline 3602 vdropl_final(struct vnode *vp) 3603 { 3604 3605 ASSERT_VI_LOCKED(vp, __func__); 3606 VNPASS(VN_IS_DOOMED(vp), vp); 3607 /* 3608 * Set the VHOLD_NO_SMR flag. 3609 * 3610 * We may be racing against vhold_smr. If they win we can just pretend 3611 * we never got this far, they will vdrop later. 3612 */ 3613 if (__predict_false(!atomic_cmpset_int(&vp->v_holdcnt, 0, VHOLD_NO_SMR))) { 3614 vfs_freevnodes_inc(); 3615 VI_UNLOCK(vp); 3616 /* 3617 * We lost the aforementioned race. Any subsequent access is 3618 * invalid as they might have managed to vdropl on their own. 3619 */ 3620 return; 3621 } 3622 /* 3623 * Don't bump freevnodes as this one is going away. 3624 */ 3625 freevnode(vp); 3626 } 3627 3628 void 3629 vdrop(struct vnode *vp) 3630 { 3631 3632 ASSERT_VI_UNLOCKED(vp, __func__); 3633 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3634 if (refcount_release_if_not_last(&vp->v_holdcnt)) 3635 return; 3636 VI_LOCK(vp); 3637 vdropl(vp); 3638 } 3639 3640 static void __always_inline 3641 vdropl_impl(struct vnode *vp, bool enqueue) 3642 { 3643 3644 ASSERT_VI_LOCKED(vp, __func__); 3645 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3646 if (!refcount_release(&vp->v_holdcnt)) { 3647 VI_UNLOCK(vp); 3648 return; 3649 } 3650 VNPASS((vp->v_iflag & VI_OWEINACT) == 0, vp); 3651 VNPASS((vp->v_iflag & VI_DEFINACT) == 0, vp); 3652 if (VN_IS_DOOMED(vp)) { 3653 vdropl_final(vp); 3654 return; 3655 } 3656 3657 vfs_freevnodes_inc(); 3658 if (vp->v_mflag & VMP_LAZYLIST) { 3659 vunlazy(vp); 3660 } 3661 3662 if (!enqueue) { 3663 VI_UNLOCK(vp); 3664 return; 3665 } 3666 3667 /* 3668 * Also unlocks the interlock. We can't assert on it as we 3669 * released our hold and by now the vnode might have been 3670 * freed. 3671 */ 3672 vdbatch_enqueue(vp); 3673 } 3674 3675 void 3676 vdropl(struct vnode *vp) 3677 { 3678 3679 vdropl_impl(vp, true); 3680 } 3681 3682 /* 3683 * vdrop a vnode when recycling 3684 * 3685 * This is a special case routine only to be used when recycling; it differs from 3686 * regular vdrop by not requeueing the vnode on LRU. 3687 * 3688 * Consider a case where vtryrecycle continuously fails with all vnodes (due to 3689 * e.g., frozen writes on the filesystem), filling the batch and causing it to 3690 * be requeued. Then vnlru will end up revisiting the same vnodes. This is a 3691 * loop which can last for as long as writes are frozen. 3692 */ 3693 static void 3694 vdropl_recycle(struct vnode *vp) 3695 { 3696 3697 vdropl_impl(vp, false); 3698 } 3699 3700 static void 3701 vdrop_recycle(struct vnode *vp) 3702 { 3703 3704 VI_LOCK(vp); 3705 vdropl_recycle(vp); 3706 } 3707 3708 /* 3709 * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT 3710 * flags. DOINGINACT prevents us from recursing in calls to vinactive. 3711 */ 3712 static int 3713 vinactivef(struct vnode *vp) 3714 { 3715 struct vm_object *obj; 3716 int error; 3717 3718 ASSERT_VOP_ELOCKED(vp, "vinactive"); 3719 ASSERT_VI_LOCKED(vp, "vinactive"); 3720 VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp, 3721 ("vinactive: recursed on VI_DOINGINACT")); 3722 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3723 vp->v_iflag |= VI_DOINGINACT; 3724 vp->v_iflag &= ~VI_OWEINACT; 3725 VI_UNLOCK(vp); 3726 /* 3727 * Before moving off the active list, we must be sure that any 3728 * modified pages are converted into the vnode's dirty 3729 * buffers, since these will no longer be checked once the 3730 * vnode is on the inactive list. 3731 * 3732 * The write-out of the dirty pages is asynchronous.
At the 3733 * point that VOP_INACTIVE() is called, there could still be 3734 * pending I/O and dirty pages in the object. 3735 */ 3736 if ((obj = vp->v_object) != NULL && (vp->v_vflag & VV_NOSYNC) == 0 && 3737 vm_object_mightbedirty(obj)) { 3738 VM_OBJECT_WLOCK(obj); 3739 vm_object_page_clean(obj, 0, 0, 0); 3740 VM_OBJECT_WUNLOCK(obj); 3741 } 3742 error = VOP_INACTIVE(vp); 3743 VI_LOCK(vp); 3744 VNASSERT(vp->v_iflag & VI_DOINGINACT, vp, 3745 ("vinactive: lost VI_DOINGINACT")); 3746 vp->v_iflag &= ~VI_DOINGINACT; 3747 return (error); 3748 } 3749 3750 int 3751 vinactive(struct vnode *vp) 3752 { 3753 3754 ASSERT_VOP_ELOCKED(vp, "vinactive"); 3755 ASSERT_VI_LOCKED(vp, "vinactive"); 3756 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3757 3758 if ((vp->v_iflag & VI_OWEINACT) == 0) 3759 return (0); 3760 if (vp->v_iflag & VI_DOINGINACT) 3761 return (0); 3762 if (vp->v_usecount > 0) { 3763 vp->v_iflag &= ~VI_OWEINACT; 3764 return (0); 3765 } 3766 return (vinactivef(vp)); 3767 } 3768 3769 /* 3770 * Remove any vnodes in the vnode table belonging to mount point mp. 3771 * 3772 * If FORCECLOSE is not specified, there should not be any active ones, 3773 * return error if any are found (nb: this is a user error, not a 3774 * system error). If FORCECLOSE is specified, detach any active vnodes 3775 * that are found. 3776 * 3777 * If WRITECLOSE is set, only flush out regular file vnodes open for 3778 * writing. 3779 * 3780 * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped. 3781 * 3782 * `rootrefs' specifies the base reference count for the root vnode 3783 * of this filesystem. The root vnode is considered busy if its 3784 * v_usecount exceeds this value. On a successful return, vflush(, td) 3785 * will call vrele() on the root vnode exactly rootrefs times. 3786 * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must 3787 * be zero. 3788 */ 3789 #ifdef DIAGNOSTIC 3790 static int busyprt = 0; /* print out busy vnodes */ 3791 SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "Print out busy vnodes"); 3792 #endif 3793 3794 int 3795 vflush(struct mount *mp, int rootrefs, int flags, struct thread *td) 3796 { 3797 struct vnode *vp, *mvp, *rootvp = NULL; 3798 struct vattr vattr; 3799 int busy = 0, error; 3800 3801 CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp, 3802 rootrefs, flags); 3803 if (rootrefs > 0) { 3804 KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0, 3805 ("vflush: bad args")); 3806 /* 3807 * Get the filesystem root vnode. We can vput() it 3808 * immediately, since with rootrefs > 0, it won't go away. 3809 */ 3810 if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) { 3811 CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d", 3812 __func__, error); 3813 return (error); 3814 } 3815 vput(rootvp); 3816 } 3817 loop: 3818 MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { 3819 vholdl(vp); 3820 error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE); 3821 if (error) { 3822 vdrop(vp); 3823 MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); 3824 goto loop; 3825 } 3826 /* 3827 * Skip over a vnodes marked VV_SYSTEM. 3828 */ 3829 if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) { 3830 VOP_UNLOCK(vp); 3831 vdrop(vp); 3832 continue; 3833 } 3834 /* 3835 * If WRITECLOSE is set, flush out unlinked but still open 3836 * files (even if open only for reading) and regular file 3837 * vnodes open for writing. 
3838 */ 3839 if (flags & WRITECLOSE) { 3840 if (vp->v_object != NULL) { 3841 VM_OBJECT_WLOCK(vp->v_object); 3842 vm_object_page_clean(vp->v_object, 0, 0, 0); 3843 VM_OBJECT_WUNLOCK(vp->v_object); 3844 } 3845 do { 3846 error = VOP_FSYNC(vp, MNT_WAIT, td); 3847 } while (error == ERELOOKUP); 3848 if (error != 0) { 3849 VOP_UNLOCK(vp); 3850 vdrop(vp); 3851 MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); 3852 return (error); 3853 } 3854 error = VOP_GETATTR(vp, &vattr, td->td_ucred); 3855 VI_LOCK(vp); 3856 3857 if ((vp->v_type == VNON || 3858 (error == 0 && vattr.va_nlink > 0)) && 3859 (vp->v_writecount <= 0 || vp->v_type != VREG)) { 3860 VOP_UNLOCK(vp); 3861 vdropl(vp); 3862 continue; 3863 } 3864 } else 3865 VI_LOCK(vp); 3866 /* 3867 * With v_usecount == 0, all we need to do is clear out the 3868 * vnode data structures and we are done. 3869 * 3870 * If FORCECLOSE is set, forcibly close the vnode. 3871 */ 3872 if (vp->v_usecount == 0 || (flags & FORCECLOSE)) { 3873 vgonel(vp); 3874 } else { 3875 busy++; 3876 #ifdef DIAGNOSTIC 3877 if (busyprt) 3878 vn_printf(vp, "vflush: busy vnode "); 3879 #endif 3880 } 3881 VOP_UNLOCK(vp); 3882 vdropl(vp); 3883 } 3884 if (rootrefs > 0 && (flags & FORCECLOSE) == 0) { 3885 /* 3886 * If just the root vnode is busy, and if its refcount 3887 * is equal to `rootrefs', then go ahead and kill it. 3888 */ 3889 VI_LOCK(rootvp); 3890 KASSERT(busy > 0, ("vflush: not busy")); 3891 VNASSERT(rootvp->v_usecount >= rootrefs, rootvp, 3892 ("vflush: usecount %d < rootrefs %d", 3893 rootvp->v_usecount, rootrefs)); 3894 if (busy == 1 && rootvp->v_usecount == rootrefs) { 3895 VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK); 3896 vgone(rootvp); 3897 VOP_UNLOCK(rootvp); 3898 busy = 0; 3899 } else 3900 VI_UNLOCK(rootvp); 3901 } 3902 if (busy) { 3903 CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__, 3904 busy); 3905 return (EBUSY); 3906 } 3907 for (; rootrefs > 0; rootrefs--) 3908 vrele(rootvp); 3909 return (0); 3910 } 3911 3912 /* 3913 * Recycle an unused vnode to the front of the free list. 3914 */ 3915 int 3916 vrecycle(struct vnode *vp) 3917 { 3918 int recycled; 3919 3920 VI_LOCK(vp); 3921 recycled = vrecyclel(vp); 3922 VI_UNLOCK(vp); 3923 return (recycled); 3924 } 3925 3926 /* 3927 * vrecycle, with the vp interlock held. 3928 */ 3929 int 3930 vrecyclel(struct vnode *vp) 3931 { 3932 int recycled; 3933 3934 ASSERT_VOP_ELOCKED(vp, __func__); 3935 ASSERT_VI_LOCKED(vp, __func__); 3936 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3937 recycled = 0; 3938 if (vp->v_usecount == 0) { 3939 recycled = 1; 3940 vgonel(vp); 3941 } 3942 return (recycled); 3943 } 3944 3945 /* 3946 * Eliminate all activity associated with a vnode 3947 * in preparation for reuse. 3948 */ 3949 void 3950 vgone(struct vnode *vp) 3951 { 3952 VI_LOCK(vp); 3953 vgonel(vp); 3954 VI_UNLOCK(vp); 3955 } 3956 3957 /* 3958 * Notify upper mounts about reclaimed or unlinked vnode. 
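 * (nullfs, for example, registers for these notifications so that it
 * can reclaim its upper vnode when the lower vnode it stacks over is
 * reclaimed.)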
3959 */ 3960 void 3961 vfs_notify_upper(struct vnode *vp, enum vfs_notify_upper_type event) 3962 { 3963 struct mount *mp; 3964 struct mount_upper_node *ump; 3965 3966 mp = atomic_load_ptr(&vp->v_mount); 3967 if (mp == NULL) 3968 return; 3969 if (TAILQ_EMPTY(&mp->mnt_notify)) 3970 return; 3971 3972 MNT_ILOCK(mp); 3973 mp->mnt_upper_pending++; 3974 KASSERT(mp->mnt_upper_pending > 0, 3975 ("%s: mnt_upper_pending %d", __func__, mp->mnt_upper_pending)); 3976 TAILQ_FOREACH(ump, &mp->mnt_notify, mnt_upper_link) { 3977 MNT_IUNLOCK(mp); 3978 switch (event) { 3979 case VFS_NOTIFY_UPPER_RECLAIM: 3980 VFS_RECLAIM_LOWERVP(ump->mp, vp); 3981 break; 3982 case VFS_NOTIFY_UPPER_UNLINK: 3983 VFS_UNLINK_LOWERVP(ump->mp, vp); 3984 break; 3985 } 3986 MNT_ILOCK(mp); 3987 } 3988 mp->mnt_upper_pending--; 3989 if ((mp->mnt_kern_flag & MNTK_UPPER_WAITER) != 0 && 3990 mp->mnt_upper_pending == 0) { 3991 mp->mnt_kern_flag &= ~MNTK_UPPER_WAITER; 3992 wakeup(&mp->mnt_uppers); 3993 } 3994 MNT_IUNLOCK(mp); 3995 } 3996 3997 /* 3998 * vgone, with the vp interlock held. 3999 */ 4000 static void 4001 vgonel(struct vnode *vp) 4002 { 4003 struct thread *td; 4004 struct mount *mp; 4005 vm_object_t object; 4006 bool active, doinginact, oweinact; 4007 4008 ASSERT_VOP_ELOCKED(vp, "vgonel"); 4009 ASSERT_VI_LOCKED(vp, "vgonel"); 4010 VNASSERT(vp->v_holdcnt, vp, 4011 ("vgonel: vp %p has no reference.", vp)); 4012 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 4013 td = curthread; 4014 4015 /* 4016 * Don't vgonel if we're already doomed. 4017 */ 4018 if (VN_IS_DOOMED(vp)) 4019 return; 4020 /* 4021 * Paired with freevnode. 4022 */ 4023 vn_seqc_write_begin_locked(vp); 4024 vunlazy_gone(vp); 4025 vn_irflag_set_locked(vp, VIRF_DOOMED); 4026 4027 /* 4028 * Check to see if the vnode is in use. If so, we have to 4029 * call VOP_CLOSE() and VOP_INACTIVE(). 4030 * 4031 * It could be that VOP_INACTIVE() requested reclamation, in 4032 * which case we should avoid recursion, so check 4033 * VI_DOINGINACT. This is not precise but good enough. 4034 */ 4035 active = vp->v_usecount > 0; 4036 oweinact = (vp->v_iflag & VI_OWEINACT) != 0; 4037 doinginact = (vp->v_iflag & VI_DOINGINACT) != 0; 4038 4039 /* 4040 * If we need to do inactive VI_OWEINACT will be set. 4041 */ 4042 if (vp->v_iflag & VI_DEFINACT) { 4043 VNASSERT(vp->v_holdcnt > 1, vp, ("lost hold count")); 4044 vp->v_iflag &= ~VI_DEFINACT; 4045 vdropl(vp); 4046 } else { 4047 VNASSERT(vp->v_holdcnt > 0, vp, ("vnode without hold count")); 4048 VI_UNLOCK(vp); 4049 } 4050 cache_purge_vgone(vp); 4051 vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM); 4052 4053 /* 4054 * If purging an active vnode, it must be closed and 4055 * deactivated before being reclaimed. 4056 */ 4057 if (active) 4058 VOP_CLOSE(vp, FNONBLOCK, NOCRED, td); 4059 if (!doinginact) { 4060 do { 4061 if (oweinact || active) { 4062 VI_LOCK(vp); 4063 vinactivef(vp); 4064 oweinact = (vp->v_iflag & VI_OWEINACT) != 0; 4065 VI_UNLOCK(vp); 4066 } 4067 } while (oweinact); 4068 } 4069 if (vp->v_type == VSOCK) 4070 vfs_unp_reclaim(vp); 4071 4072 /* 4073 * Clean out any buffers associated with the vnode. 4074 * If the flush fails, just toss the buffers. 
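 * (The V_SAVE pass below first tries to write any dirty buffers out; if
 * that fails, the subsequent vinvalbuf() calls without V_SAVE simply
 * discard them.)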
4075 */ 4076 mp = NULL; 4077 if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd)) 4078 (void) vn_start_secondary_write(vp, &mp, V_WAIT); 4079 if (vinvalbuf(vp, V_SAVE, 0, 0) != 0) { 4080 while (vinvalbuf(vp, 0, 0, 0) != 0) 4081 ; 4082 } 4083 4084 BO_LOCK(&vp->v_bufobj); 4085 KASSERT(TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd) && 4086 vp->v_bufobj.bo_dirty.bv_cnt == 0 && 4087 TAILQ_EMPTY(&vp->v_bufobj.bo_clean.bv_hd) && 4088 vp->v_bufobj.bo_clean.bv_cnt == 0, 4089 ("vp %p bufobj not invalidated", vp)); 4090 4091 /* 4092 * For VMIO bufobj, BO_DEAD is set later, or in 4093 * vm_object_terminate() after the object's page queue is 4094 * flushed. 4095 */ 4096 object = vp->v_bufobj.bo_object; 4097 if (object == NULL) 4098 vp->v_bufobj.bo_flag |= BO_DEAD; 4099 BO_UNLOCK(&vp->v_bufobj); 4100 4101 /* 4102 * Handle the VM part. Tmpfs handles v_object on its own (the 4103 * OBJT_VNODE check). Nullfs or other bypassing filesystems 4104 * should not touch the object borrowed from the lower vnode 4105 * (the handle check). 4106 */ 4107 if (object != NULL && object->type == OBJT_VNODE && 4108 object->handle == vp) 4109 vnode_destroy_vobject(vp); 4110 4111 /* 4112 * Reclaim the vnode. 4113 */ 4114 if (VOP_RECLAIM(vp)) 4115 panic("vgone: cannot reclaim"); 4116 if (mp != NULL) 4117 vn_finished_secondary_write(mp); 4118 VNASSERT(vp->v_object == NULL, vp, 4119 ("vop_reclaim left v_object vp=%p", vp)); 4120 /* 4121 * Clear the advisory locks and wake up waiting threads. 4122 */ 4123 if (vp->v_lockf != NULL) { 4124 (void)VOP_ADVLOCKPURGE(vp); 4125 vp->v_lockf = NULL; 4126 } 4127 /* 4128 * Delete from old mount point vnode list. 4129 */ 4130 if (vp->v_mount == NULL) { 4131 VI_LOCK(vp); 4132 } else { 4133 delmntque(vp); 4134 ASSERT_VI_LOCKED(vp, "vgonel 2"); 4135 } 4136 /* 4137 * Done with purge, reset to the standard lock and invalidate 4138 * the vnode. 4139 */ 4140 vp->v_vnlock = &vp->v_lock; 4141 vp->v_op = &dead_vnodeops; 4142 vp->v_type = VBAD; 4143 } 4144 4145 /* 4146 * Print out a description of a vnode. 4147 */ 4148 static const char * const typename[] = 4149 {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD", 4150 "VMARKER"}; 4151 4152 _Static_assert((VHOLD_ALL_FLAGS & ~VHOLD_NO_SMR) == 0, 4153 "new hold count flag not added to vn_printf"); 4154 4155 void 4156 vn_printf(struct vnode *vp, const char *fmt, ...) 
4157 { 4158 va_list ap; 4159 char buf[256], buf2[16]; 4160 u_long flags; 4161 u_int holdcnt; 4162 short irflag; 4163 4164 va_start(ap, fmt); 4165 vprintf(fmt, ap); 4166 va_end(ap); 4167 printf("%p: ", (void *)vp); 4168 printf("type %s\n", typename[vp->v_type]); 4169 holdcnt = atomic_load_int(&vp->v_holdcnt); 4170 printf(" usecount %d, writecount %d, refcount %d seqc users %d", 4171 vp->v_usecount, vp->v_writecount, holdcnt & ~VHOLD_ALL_FLAGS, 4172 vp->v_seqc_users); 4173 switch (vp->v_type) { 4174 case VDIR: 4175 printf(" mountedhere %p\n", vp->v_mountedhere); 4176 break; 4177 case VCHR: 4178 printf(" rdev %p\n", vp->v_rdev); 4179 break; 4180 case VSOCK: 4181 printf(" socket %p\n", vp->v_unpcb); 4182 break; 4183 case VFIFO: 4184 printf(" fifoinfo %p\n", vp->v_fifoinfo); 4185 break; 4186 default: 4187 printf("\n"); 4188 break; 4189 } 4190 buf[0] = '\0'; 4191 buf[1] = '\0'; 4192 if (holdcnt & VHOLD_NO_SMR) 4193 strlcat(buf, "|VHOLD_NO_SMR", sizeof(buf)); 4194 printf(" hold count flags (%s)\n", buf + 1); 4195 4196 buf[0] = '\0'; 4197 buf[1] = '\0'; 4198 irflag = vn_irflag_read(vp); 4199 if (irflag & VIRF_DOOMED) 4200 strlcat(buf, "|VIRF_DOOMED", sizeof(buf)); 4201 if (irflag & VIRF_PGREAD) 4202 strlcat(buf, "|VIRF_PGREAD", sizeof(buf)); 4203 if (irflag & VIRF_MOUNTPOINT) 4204 strlcat(buf, "|VIRF_MOUNTPOINT", sizeof(buf)); 4205 if (irflag & VIRF_TEXT_REF) 4206 strlcat(buf, "|VIRF_TEXT_REF", sizeof(buf)); 4207 flags = irflag & ~(VIRF_DOOMED | VIRF_PGREAD | VIRF_MOUNTPOINT | VIRF_TEXT_REF); 4208 if (flags != 0) { 4209 snprintf(buf2, sizeof(buf2), "|VIRF(0x%lx)", flags); 4210 strlcat(buf, buf2, sizeof(buf)); 4211 } 4212 if (vp->v_vflag & VV_ROOT) 4213 strlcat(buf, "|VV_ROOT", sizeof(buf)); 4214 if (vp->v_vflag & VV_ISTTY) 4215 strlcat(buf, "|VV_ISTTY", sizeof(buf)); 4216 if (vp->v_vflag & VV_NOSYNC) 4217 strlcat(buf, "|VV_NOSYNC", sizeof(buf)); 4218 if (vp->v_vflag & VV_ETERNALDEV) 4219 strlcat(buf, "|VV_ETERNALDEV", sizeof(buf)); 4220 if (vp->v_vflag & VV_CACHEDLABEL) 4221 strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf)); 4222 if (vp->v_vflag & VV_VMSIZEVNLOCK) 4223 strlcat(buf, "|VV_VMSIZEVNLOCK", sizeof(buf)); 4224 if (vp->v_vflag & VV_COPYONWRITE) 4225 strlcat(buf, "|VV_COPYONWRITE", sizeof(buf)); 4226 if (vp->v_vflag & VV_SYSTEM) 4227 strlcat(buf, "|VV_SYSTEM", sizeof(buf)); 4228 if (vp->v_vflag & VV_PROCDEP) 4229 strlcat(buf, "|VV_PROCDEP", sizeof(buf)); 4230 if (vp->v_vflag & VV_DELETED) 4231 strlcat(buf, "|VV_DELETED", sizeof(buf)); 4232 if (vp->v_vflag & VV_MD) 4233 strlcat(buf, "|VV_MD", sizeof(buf)); 4234 if (vp->v_vflag & VV_FORCEINSMQ) 4235 strlcat(buf, "|VV_FORCEINSMQ", sizeof(buf)); 4236 if (vp->v_vflag & VV_READLINK) 4237 strlcat(buf, "|VV_READLINK", sizeof(buf)); 4238 flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_ETERNALDEV | 4239 VV_CACHEDLABEL | VV_VMSIZEVNLOCK | VV_COPYONWRITE | VV_SYSTEM | 4240 VV_PROCDEP | VV_DELETED | VV_MD | VV_FORCEINSMQ | VV_READLINK); 4241 if (flags != 0) { 4242 snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags); 4243 strlcat(buf, buf2, sizeof(buf)); 4244 } 4245 if (vp->v_iflag & VI_MOUNT) 4246 strlcat(buf, "|VI_MOUNT", sizeof(buf)); 4247 if (vp->v_iflag & VI_DOINGINACT) 4248 strlcat(buf, "|VI_DOINGINACT", sizeof(buf)); 4249 if (vp->v_iflag & VI_OWEINACT) 4250 strlcat(buf, "|VI_OWEINACT", sizeof(buf)); 4251 if (vp->v_iflag & VI_DEFINACT) 4252 strlcat(buf, "|VI_DEFINACT", sizeof(buf)); 4253 if (vp->v_iflag & VI_FOPENING) 4254 strlcat(buf, "|VI_FOPENING", sizeof(buf)); 4255 flags = vp->v_iflag & ~(VI_MOUNT | VI_DOINGINACT | 4256 VI_OWEINACT | 
VI_DEFINACT | VI_FOPENING); 4257 if (flags != 0) { 4258 snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags); 4259 strlcat(buf, buf2, sizeof(buf)); 4260 } 4261 if (vp->v_mflag & VMP_LAZYLIST) 4262 strlcat(buf, "|VMP_LAZYLIST", sizeof(buf)); 4263 flags = vp->v_mflag & ~(VMP_LAZYLIST); 4264 if (flags != 0) { 4265 snprintf(buf2, sizeof(buf2), "|VMP(0x%lx)", flags); 4266 strlcat(buf, buf2, sizeof(buf)); 4267 } 4268 printf(" flags (%s)", buf + 1); 4269 if (mtx_owned(VI_MTX(vp))) 4270 printf(" VI_LOCKed"); 4271 printf("\n"); 4272 if (vp->v_object != NULL) 4273 printf(" v_object %p ref %d pages %d " 4274 "cleanbuf %d dirtybuf %d\n", 4275 vp->v_object, vp->v_object->ref_count, 4276 vp->v_object->resident_page_count, 4277 vp->v_bufobj.bo_clean.bv_cnt, 4278 vp->v_bufobj.bo_dirty.bv_cnt); 4279 printf(" "); 4280 lockmgr_printinfo(vp->v_vnlock); 4281 if (vp->v_data != NULL) 4282 VOP_PRINT(vp); 4283 } 4284 4285 #ifdef DDB 4286 /* 4287 * List all of the locked vnodes in the system. 4288 * Called when debugging the kernel. 4289 */ 4290 DB_SHOW_COMMAND_FLAGS(lockedvnods, lockedvnodes, DB_CMD_MEMSAFE) 4291 { 4292 struct mount *mp; 4293 struct vnode *vp; 4294 4295 /* 4296 * Note: because this is DDB, we can't obey the locking semantics 4297 * for these structures, which means we could catch an inconsistent 4298 * state and dereference a nasty pointer. Not much to be done 4299 * about that. 4300 */ 4301 db_printf("Locked vnodes\n"); 4302 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 4303 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 4304 if (vp->v_type != VMARKER && VOP_ISLOCKED(vp)) 4305 vn_printf(vp, "vnode "); 4306 } 4307 } 4308 } 4309 4310 /* 4311 * Show details about the given vnode. 4312 */ 4313 DB_SHOW_COMMAND(vnode, db_show_vnode) 4314 { 4315 struct vnode *vp; 4316 4317 if (!have_addr) 4318 return; 4319 vp = (struct vnode *)addr; 4320 vn_printf(vp, "vnode "); 4321 } 4322 4323 /* 4324 * Show details about the given mount point. 4325 */ 4326 DB_SHOW_COMMAND(mount, db_show_mount) 4327 { 4328 struct mount *mp; 4329 struct vfsopt *opt; 4330 struct statfs *sp; 4331 struct vnode *vp; 4332 char buf[512]; 4333 uint64_t mflags; 4334 u_int flags; 4335 4336 if (!have_addr) { 4337 /* No address given, print short info about all mount points. 
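 * Typical usage from the DDB prompt (the address below is illustrative
 * and would be taken from the summary listing printed here):
 *	db> show mount
 *	db> show mount 0xfffff80003123400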
*/ 4338 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 4339 db_printf("%p %s on %s (%s)\n", mp, 4340 mp->mnt_stat.f_mntfromname, 4341 mp->mnt_stat.f_mntonname, 4342 mp->mnt_stat.f_fstypename); 4343 if (db_pager_quit) 4344 break; 4345 } 4346 db_printf("\nMore info: show mount <addr>\n"); 4347 return; 4348 } 4349 4350 mp = (struct mount *)addr; 4351 db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname, 4352 mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename); 4353 4354 buf[0] = '\0'; 4355 mflags = mp->mnt_flag; 4356 #define MNT_FLAG(flag) do { \ 4357 if (mflags & (flag)) { \ 4358 if (buf[0] != '\0') \ 4359 strlcat(buf, ", ", sizeof(buf)); \ 4360 strlcat(buf, (#flag) + 4, sizeof(buf)); \ 4361 mflags &= ~(flag); \ 4362 } \ 4363 } while (0) 4364 MNT_FLAG(MNT_RDONLY); 4365 MNT_FLAG(MNT_SYNCHRONOUS); 4366 MNT_FLAG(MNT_NOEXEC); 4367 MNT_FLAG(MNT_NOSUID); 4368 MNT_FLAG(MNT_NFS4ACLS); 4369 MNT_FLAG(MNT_UNION); 4370 MNT_FLAG(MNT_ASYNC); 4371 MNT_FLAG(MNT_SUIDDIR); 4372 MNT_FLAG(MNT_SOFTDEP); 4373 MNT_FLAG(MNT_NOSYMFOLLOW); 4374 MNT_FLAG(MNT_GJOURNAL); 4375 MNT_FLAG(MNT_MULTILABEL); 4376 MNT_FLAG(MNT_ACLS); 4377 MNT_FLAG(MNT_NOATIME); 4378 MNT_FLAG(MNT_NOCLUSTERR); 4379 MNT_FLAG(MNT_NOCLUSTERW); 4380 MNT_FLAG(MNT_SUJ); 4381 MNT_FLAG(MNT_EXRDONLY); 4382 MNT_FLAG(MNT_EXPORTED); 4383 MNT_FLAG(MNT_DEFEXPORTED); 4384 MNT_FLAG(MNT_EXPORTANON); 4385 MNT_FLAG(MNT_EXKERB); 4386 MNT_FLAG(MNT_EXPUBLIC); 4387 MNT_FLAG(MNT_LOCAL); 4388 MNT_FLAG(MNT_QUOTA); 4389 MNT_FLAG(MNT_ROOTFS); 4390 MNT_FLAG(MNT_USER); 4391 MNT_FLAG(MNT_IGNORE); 4392 MNT_FLAG(MNT_UPDATE); 4393 MNT_FLAG(MNT_DELEXPORT); 4394 MNT_FLAG(MNT_RELOAD); 4395 MNT_FLAG(MNT_FORCE); 4396 MNT_FLAG(MNT_SNAPSHOT); 4397 MNT_FLAG(MNT_BYFSID); 4398 #undef MNT_FLAG 4399 if (mflags != 0) { 4400 if (buf[0] != '\0') 4401 strlcat(buf, ", ", sizeof(buf)); 4402 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), 4403 "0x%016jx", mflags); 4404 } 4405 db_printf(" mnt_flag = %s\n", buf); 4406 4407 buf[0] = '\0'; 4408 flags = mp->mnt_kern_flag; 4409 #define MNT_KERN_FLAG(flag) do { \ 4410 if (flags & (flag)) { \ 4411 if (buf[0] != '\0') \ 4412 strlcat(buf, ", ", sizeof(buf)); \ 4413 strlcat(buf, (#flag) + 5, sizeof(buf)); \ 4414 flags &= ~(flag); \ 4415 } \ 4416 } while (0) 4417 MNT_KERN_FLAG(MNTK_UNMOUNTF); 4418 MNT_KERN_FLAG(MNTK_ASYNC); 4419 MNT_KERN_FLAG(MNTK_SOFTDEP); 4420 MNT_KERN_FLAG(MNTK_NOMSYNC); 4421 MNT_KERN_FLAG(MNTK_DRAINING); 4422 MNT_KERN_FLAG(MNTK_REFEXPIRE); 4423 MNT_KERN_FLAG(MNTK_EXTENDED_SHARED); 4424 MNT_KERN_FLAG(MNTK_SHARED_WRITES); 4425 MNT_KERN_FLAG(MNTK_NO_IOPF); 4426 MNT_KERN_FLAG(MNTK_RECURSE); 4427 MNT_KERN_FLAG(MNTK_UPPER_WAITER); 4428 MNT_KERN_FLAG(MNTK_UNLOCKED_INSMNTQUE); 4429 MNT_KERN_FLAG(MNTK_USES_BCACHE); 4430 MNT_KERN_FLAG(MNTK_VMSETSIZE_BUG); 4431 MNT_KERN_FLAG(MNTK_FPLOOKUP); 4432 MNT_KERN_FLAG(MNTK_TASKQUEUE_WAITER); 4433 MNT_KERN_FLAG(MNTK_NOASYNC); 4434 MNT_KERN_FLAG(MNTK_UNMOUNT); 4435 MNT_KERN_FLAG(MNTK_MWAIT); 4436 MNT_KERN_FLAG(MNTK_SUSPEND); 4437 MNT_KERN_FLAG(MNTK_SUSPEND2); 4438 MNT_KERN_FLAG(MNTK_SUSPENDED); 4439 MNT_KERN_FLAG(MNTK_NULL_NOCACHE); 4440 MNT_KERN_FLAG(MNTK_LOOKUP_SHARED); 4441 #undef MNT_KERN_FLAG 4442 if (flags != 0) { 4443 if (buf[0] != '\0') 4444 strlcat(buf, ", ", sizeof(buf)); 4445 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), 4446 "0x%08x", flags); 4447 } 4448 db_printf(" mnt_kern_flag = %s\n", buf); 4449 4450 db_printf(" mnt_opt = "); 4451 opt = TAILQ_FIRST(mp->mnt_opt); 4452 if (opt != NULL) { 4453 db_printf("%s", opt->name); 4454 opt = TAILQ_NEXT(opt, link); 4455 while (opt != 
NULL) { 4456 db_printf(", %s", opt->name); 4457 opt = TAILQ_NEXT(opt, link); 4458 } 4459 } 4460 db_printf("\n"); 4461 4462 sp = &mp->mnt_stat; 4463 db_printf(" mnt_stat = { version=%u type=%u flags=0x%016jx " 4464 "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju " 4465 "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju " 4466 "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n", 4467 (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags, 4468 (uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize, 4469 (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree, 4470 (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files, 4471 (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites, 4472 (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads, 4473 (uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax, 4474 (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]); 4475 4476 db_printf(" mnt_cred = { uid=%u ruid=%u", 4477 (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid); 4478 if (jailed(mp->mnt_cred)) 4479 db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id); 4480 db_printf(" }\n"); 4481 db_printf(" mnt_ref = %d (with %d in the struct)\n", 4482 vfs_mount_fetch_counter(mp, MNT_COUNT_REF), mp->mnt_ref); 4483 db_printf(" mnt_gen = %d\n", mp->mnt_gen); 4484 db_printf(" mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize); 4485 db_printf(" mnt_lazyvnodelistsize = %d\n", 4486 mp->mnt_lazyvnodelistsize); 4487 db_printf(" mnt_writeopcount = %d (with %d in the struct)\n", 4488 vfs_mount_fetch_counter(mp, MNT_COUNT_WRITEOPCOUNT), mp->mnt_writeopcount); 4489 db_printf(" mnt_iosize_max = %d\n", mp->mnt_iosize_max); 4490 db_printf(" mnt_hashseed = %u\n", mp->mnt_hashseed); 4491 db_printf(" mnt_lockref = %d (with %d in the struct)\n", 4492 vfs_mount_fetch_counter(mp, MNT_COUNT_LOCKREF), mp->mnt_lockref); 4493 db_printf(" mnt_secondary_writes = %d\n", mp->mnt_secondary_writes); 4494 db_printf(" mnt_secondary_accwrites = %d\n", 4495 mp->mnt_secondary_accwrites); 4496 db_printf(" mnt_gjprovider = %s\n", 4497 mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL"); 4498 db_printf(" mnt_vfs_ops = %d\n", mp->mnt_vfs_ops); 4499 4500 db_printf("\n\nList of active vnodes\n"); 4501 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 4502 if (vp->v_type != VMARKER && vp->v_holdcnt > 0) { 4503 vn_printf(vp, "vnode "); 4504 if (db_pager_quit) 4505 break; 4506 } 4507 } 4508 db_printf("\n\nList of inactive vnodes\n"); 4509 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 4510 if (vp->v_type != VMARKER && vp->v_holdcnt == 0) { 4511 vn_printf(vp, "vnode "); 4512 if (db_pager_quit) 4513 break; 4514 } 4515 } 4516 } 4517 #endif /* DDB */ 4518 4519 /* 4520 * Fill in a struct xvfsconf based on a struct vfsconf. 4521 */ 4522 static int 4523 vfsconf2x(struct sysctl_req *req, struct vfsconf *vfsp) 4524 { 4525 struct xvfsconf xvfsp; 4526 4527 bzero(&xvfsp, sizeof(xvfsp)); 4528 strcpy(xvfsp.vfc_name, vfsp->vfc_name); 4529 xvfsp.vfc_typenum = vfsp->vfc_typenum; 4530 xvfsp.vfc_refcount = vfsp->vfc_refcount; 4531 xvfsp.vfc_flags = vfsp->vfc_flags; 4532 /* 4533 * These are unused in userland, we keep them 4534 * to not break binary compatibility. 
4535 */ 4536 xvfsp.vfc_vfsops = NULL; 4537 xvfsp.vfc_next = NULL; 4538 return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); 4539 } 4540 4541 #ifdef COMPAT_FREEBSD32 4542 struct xvfsconf32 { 4543 uint32_t vfc_vfsops; 4544 char vfc_name[MFSNAMELEN]; 4545 int32_t vfc_typenum; 4546 int32_t vfc_refcount; 4547 int32_t vfc_flags; 4548 uint32_t vfc_next; 4549 }; 4550 4551 static int 4552 vfsconf2x32(struct sysctl_req *req, struct vfsconf *vfsp) 4553 { 4554 struct xvfsconf32 xvfsp; 4555 4556 bzero(&xvfsp, sizeof(xvfsp)); 4557 strcpy(xvfsp.vfc_name, vfsp->vfc_name); 4558 xvfsp.vfc_typenum = vfsp->vfc_typenum; 4559 xvfsp.vfc_refcount = vfsp->vfc_refcount; 4560 xvfsp.vfc_flags = vfsp->vfc_flags; 4561 return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); 4562 } 4563 #endif 4564 4565 /* 4566 * Top level filesystem related information gathering. 4567 */ 4568 static int 4569 sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS) 4570 { 4571 struct vfsconf *vfsp; 4572 int error; 4573 4574 error = 0; 4575 vfsconf_slock(); 4576 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { 4577 #ifdef COMPAT_FREEBSD32 4578 if (req->flags & SCTL_MASK32) 4579 error = vfsconf2x32(req, vfsp); 4580 else 4581 #endif 4582 error = vfsconf2x(req, vfsp); 4583 if (error) 4584 break; 4585 } 4586 vfsconf_sunlock(); 4587 return (error); 4588 } 4589 4590 SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD | 4591 CTLFLAG_MPSAFE, NULL, 0, sysctl_vfs_conflist, 4592 "S,xvfsconf", "List of all configured filesystems"); 4593 4594 #ifndef BURN_BRIDGES 4595 static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS); 4596 4597 static int 4598 vfs_sysctl(SYSCTL_HANDLER_ARGS) 4599 { 4600 int *name = (int *)arg1 - 1; /* XXX */ 4601 u_int namelen = arg2 + 1; /* XXX */ 4602 struct vfsconf *vfsp; 4603 4604 log(LOG_WARNING, "userland calling deprecated sysctl, " 4605 "please rebuild world\n"); 4606 4607 #if 1 || defined(COMPAT_PRELITE2) 4608 /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. 
*/ 4609 if (namelen == 1) 4610 return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); 4611 #endif 4612 4613 switch (name[1]) { 4614 case VFS_MAXTYPENUM: 4615 if (namelen != 2) 4616 return (ENOTDIR); 4617 return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); 4618 case VFS_CONF: 4619 if (namelen != 3) 4620 return (ENOTDIR); /* overloaded */ 4621 vfsconf_slock(); 4622 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { 4623 if (vfsp->vfc_typenum == name[2]) 4624 break; 4625 } 4626 vfsconf_sunlock(); 4627 if (vfsp == NULL) 4628 return (EOPNOTSUPP); 4629 #ifdef COMPAT_FREEBSD32 4630 if (req->flags & SCTL_MASK32) 4631 return (vfsconf2x32(req, vfsp)); 4632 else 4633 #endif 4634 return (vfsconf2x(req, vfsp)); 4635 } 4636 return (EOPNOTSUPP); 4637 } 4638 4639 static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP | 4640 CTLFLAG_MPSAFE, vfs_sysctl, 4641 "Generic filesystem"); 4642 4643 #if 1 || defined(COMPAT_PRELITE2) 4644 4645 static int 4646 sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS) 4647 { 4648 int error; 4649 struct vfsconf *vfsp; 4650 struct ovfsconf ovfs; 4651 4652 vfsconf_slock(); 4653 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { 4654 bzero(&ovfs, sizeof(ovfs)); 4655 ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ 4656 strcpy(ovfs.vfc_name, vfsp->vfc_name); 4657 ovfs.vfc_index = vfsp->vfc_typenum; 4658 ovfs.vfc_refcount = vfsp->vfc_refcount; 4659 ovfs.vfc_flags = vfsp->vfc_flags; 4660 error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); 4661 if (error != 0) { 4662 vfsconf_sunlock(); 4663 return (error); 4664 } 4665 } 4666 vfsconf_sunlock(); 4667 return (0); 4668 } 4669 4670 #endif /* 1 || COMPAT_PRELITE2 */ 4671 #endif /* !BURN_BRIDGES */ 4672 4673 #define KINFO_VNODESLOP 10 4674 #ifdef notyet 4675 /* 4676 * Dump vnode list (via sysctl). 4677 */ 4678 /* ARGSUSED */ 4679 static int 4680 sysctl_vnode(SYSCTL_HANDLER_ARGS) 4681 { 4682 struct xvnode *xvn; 4683 struct mount *mp; 4684 struct vnode *vp; 4685 int error, len, n; 4686 4687 /* 4688 * Stale numvnodes access is not fatal here. 4689 */ 4690 req->lock = 0; 4691 len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn; 4692 if (!req->oldptr) 4693 /* Make an estimate */ 4694 return (SYSCTL_OUT(req, 0, len)); 4695 4696 error = sysctl_wire_old_buffer(req, 0); 4697 if (error != 0) 4698 return (error); 4699 xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK); 4700 n = 0; 4701 mtx_lock(&mountlist_mtx); 4702 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 4703 if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) 4704 continue; 4705 MNT_ILOCK(mp); 4706 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 4707 if (n == len) 4708 break; 4709 vref(vp); 4710 xvn[n].xv_size = sizeof *xvn; 4711 xvn[n].xv_vnode = vp; 4712 xvn[n].xv_id = 0; /* XXX compat */ 4713 #define XV_COPY(field) xvn[n].xv_##field = vp->v_##field 4714 XV_COPY(usecount); 4715 XV_COPY(writecount); 4716 XV_COPY(holdcnt); 4717 XV_COPY(mount); 4718 XV_COPY(numoutput); 4719 XV_COPY(type); 4720 #undef XV_COPY 4721 xvn[n].xv_flag = vp->v_vflag; 4722 4723 switch (vp->v_type) { 4724 case VREG: 4725 case VDIR: 4726 case VLNK: 4727 break; 4728 case VBLK: 4729 case VCHR: 4730 if (vp->v_rdev == NULL) { 4731 vrele(vp); 4732 continue; 4733 } 4734 xvn[n].xv_dev = dev2udev(vp->v_rdev); 4735 break; 4736 case VSOCK: 4737 xvn[n].xv_socket = vp->v_socket; 4738 break; 4739 case VFIFO: 4740 xvn[n].xv_fifo = vp->v_fifoinfo; 4741 break; 4742 case VNON: 4743 case VBAD: 4744 default: 4745 /* shouldn't happen? 
*/ 4746 vrele(vp); 4747 continue; 4748 } 4749 vrele(vp); 4750 ++n; 4751 } 4752 MNT_IUNLOCK(mp); 4753 mtx_lock(&mountlist_mtx); 4754 vfs_unbusy(mp); 4755 if (n == len) 4756 break; 4757 } 4758 mtx_unlock(&mountlist_mtx); 4759 4760 error = SYSCTL_OUT(req, xvn, n * sizeof *xvn); 4761 free(xvn, M_TEMP); 4762 return (error); 4763 } 4764 4765 SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE | CTLFLAG_RD | 4766 CTLFLAG_MPSAFE, 0, 0, sysctl_vnode, "S,xvnode", 4767 ""); 4768 #endif 4769 4770 static void 4771 unmount_or_warn(struct mount *mp) 4772 { 4773 int error; 4774 4775 error = dounmount(mp, MNT_FORCE, curthread); 4776 if (error != 0) { 4777 printf("unmount of %s failed (", mp->mnt_stat.f_mntonname); 4778 if (error == EBUSY) 4779 printf("BUSY)\n"); 4780 else 4781 printf("%d)\n", error); 4782 } 4783 } 4784 4785 /* 4786 * Unmount all filesystems. The list is traversed in reverse order 4787 * of mounting to avoid dependencies. 4788 */ 4789 void 4790 vfs_unmountall(void) 4791 { 4792 struct mount *mp, *tmp; 4793 4794 CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__); 4795 4796 /* 4797 * Since this only runs when rebooting, it is not interlocked. 4798 */ 4799 TAILQ_FOREACH_REVERSE_SAFE(mp, &mountlist, mntlist, mnt_list, tmp) { 4800 vfs_ref(mp); 4801 4802 /* 4803 * Forcibly unmounting "/dev" before "/" would prevent clean 4804 * unmount of the latter. 4805 */ 4806 if (mp == rootdevmp) 4807 continue; 4808 4809 unmount_or_warn(mp); 4810 } 4811 4812 if (rootdevmp != NULL) 4813 unmount_or_warn(rootdevmp); 4814 } 4815 4816 static void 4817 vfs_deferred_inactive(struct vnode *vp, int lkflags) 4818 { 4819 4820 ASSERT_VI_LOCKED(vp, __func__); 4821 VNASSERT((vp->v_iflag & VI_DEFINACT) == 0, vp, ("VI_DEFINACT still set")); 4822 if ((vp->v_iflag & VI_OWEINACT) == 0) { 4823 vdropl(vp); 4824 return; 4825 } 4826 if (vn_lock(vp, lkflags) == 0) { 4827 VI_LOCK(vp); 4828 vinactive(vp); 4829 VOP_UNLOCK(vp); 4830 vdropl(vp); 4831 return; 4832 } 4833 vdefer_inactive_unlocked(vp); 4834 } 4835 4836 static int 4837 vfs_periodic_inactive_filter(struct vnode *vp, void *arg) 4838 { 4839 4840 return (vp->v_iflag & VI_DEFINACT); 4841 } 4842 4843 static void __noinline 4844 vfs_periodic_inactive(struct mount *mp, int flags) 4845 { 4846 struct vnode *vp, *mvp; 4847 int lkflags; 4848 4849 lkflags = LK_EXCLUSIVE | LK_INTERLOCK; 4850 if (flags != MNT_WAIT) 4851 lkflags |= LK_NOWAIT; 4852 4853 MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, vfs_periodic_inactive_filter, NULL) { 4854 if ((vp->v_iflag & VI_DEFINACT) == 0) { 4855 VI_UNLOCK(vp); 4856 continue; 4857 } 4858 vp->v_iflag &= ~VI_DEFINACT; 4859 vfs_deferred_inactive(vp, lkflags); 4860 } 4861 } 4862 4863 static inline bool 4864 vfs_want_msync(struct vnode *vp) 4865 { 4866 struct vm_object *obj; 4867 4868 /* 4869 * This test may be performed without any locks held. 4870 * We rely on vm_object's type stability. 
4871 */ 4872 if (vp->v_vflag & VV_NOSYNC) 4873 return (false); 4874 obj = vp->v_object; 4875 return (obj != NULL && vm_object_mightbedirty(obj)); 4876 } 4877 4878 static int 4879 vfs_periodic_msync_inactive_filter(struct vnode *vp, void *arg __unused) 4880 { 4881 4882 if (vp->v_vflag & VV_NOSYNC) 4883 return (false); 4884 if (vp->v_iflag & VI_DEFINACT) 4885 return (true); 4886 return (vfs_want_msync(vp)); 4887 } 4888 4889 static void __noinline 4890 vfs_periodic_msync_inactive(struct mount *mp, int flags) 4891 { 4892 struct vnode *vp, *mvp; 4893 struct vm_object *obj; 4894 int lkflags, objflags; 4895 bool seen_defer; 4896 4897 lkflags = LK_EXCLUSIVE | LK_INTERLOCK; 4898 if (flags != MNT_WAIT) { 4899 lkflags |= LK_NOWAIT; 4900 objflags = OBJPC_NOSYNC; 4901 } else { 4902 objflags = OBJPC_SYNC; 4903 } 4904 4905 MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, vfs_periodic_msync_inactive_filter, NULL) { 4906 seen_defer = false; 4907 if (vp->v_iflag & VI_DEFINACT) { 4908 vp->v_iflag &= ~VI_DEFINACT; 4909 seen_defer = true; 4910 } 4911 if (!vfs_want_msync(vp)) { 4912 if (seen_defer) 4913 vfs_deferred_inactive(vp, lkflags); 4914 else 4915 VI_UNLOCK(vp); 4916 continue; 4917 } 4918 if (vget(vp, lkflags) == 0) { 4919 obj = vp->v_object; 4920 if (obj != NULL && (vp->v_vflag & VV_NOSYNC) == 0) { 4921 VM_OBJECT_WLOCK(obj); 4922 vm_object_page_clean(obj, 0, 0, objflags); 4923 VM_OBJECT_WUNLOCK(obj); 4924 } 4925 vput(vp); 4926 if (seen_defer) 4927 vdrop(vp); 4928 } else { 4929 if (seen_defer) 4930 vdefer_inactive_unlocked(vp); 4931 } 4932 } 4933 } 4934 4935 void 4936 vfs_periodic(struct mount *mp, int flags) 4937 { 4938 4939 CTR2(KTR_VFS, "%s: mp %p", __func__, mp); 4940 4941 if ((mp->mnt_kern_flag & MNTK_NOMSYNC) != 0) 4942 vfs_periodic_inactive(mp, flags); 4943 else 4944 vfs_periodic_msync_inactive(mp, flags); 4945 } 4946 4947 static void 4948 destroy_vpollinfo_free(struct vpollinfo *vi) 4949 { 4950 4951 knlist_destroy(&vi->vpi_selinfo.si_note); 4952 mtx_destroy(&vi->vpi_lock); 4953 free(vi, M_VNODEPOLL); 4954 } 4955 4956 static void 4957 destroy_vpollinfo(struct vpollinfo *vi) 4958 { 4959 4960 knlist_clear(&vi->vpi_selinfo.si_note, 1); 4961 seldrain(&vi->vpi_selinfo); 4962 destroy_vpollinfo_free(vi); 4963 } 4964 4965 /* 4966 * Initialize per-vnode helper structure to hold poll-related state. 4967 */ 4968 void 4969 v_addpollinfo(struct vnode *vp) 4970 { 4971 struct vpollinfo *vi; 4972 4973 if (vp->v_pollinfo != NULL) 4974 return; 4975 vi = malloc(sizeof(*vi), M_VNODEPOLL, M_WAITOK | M_ZERO); 4976 mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF); 4977 knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock, 4978 vfs_knlunlock, vfs_knl_assert_lock); 4979 VI_LOCK(vp); 4980 if (vp->v_pollinfo != NULL) { 4981 VI_UNLOCK(vp); 4982 destroy_vpollinfo_free(vi); 4983 return; 4984 } 4985 vp->v_pollinfo = vi; 4986 VI_UNLOCK(vp); 4987 } 4988 4989 /* 4990 * Record a process's interest in events which might happen to 4991 * a vnode. Because poll uses the historic select-style interface 4992 * internally, this routine serves as both the ``check for any 4993 * pending events'' and the ``record my interest in future events'' 4994 * functions. (These are done together, while the lock is held, 4995 * to avoid race conditions.) 
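 * A sketch of a typical caller, e.g. a VOP_POLL implementation that can
 * answer the standard events itself and records interest in the rest
 * (this is an illustration, not necessarily the stock vop_stdpoll()):
 *
 *	static int
 *	example_poll(struct vop_poll_args *ap)
 *	{
 *		if (ap->a_events & ~POLLSTANDARD)
 *			return (vn_pollrecord(ap->a_vp, ap->a_td,
 *			    ap->a_events));
 *		return (ap->a_events & (POLLIN | POLLOUT | POLLRDNORM |
 *		    POLLWRNORM));
 *	}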
4996 */ 4997 int 4998 vn_pollrecord(struct vnode *vp, struct thread *td, int events) 4999 { 5000 5001 v_addpollinfo(vp); 5002 mtx_lock(&vp->v_pollinfo->vpi_lock); 5003 if (vp->v_pollinfo->vpi_revents & events) { 5004 /* 5005 * This leaves events we are not interested 5006 * in available for the other process which 5007 * which presumably had requested them 5008 * (otherwise they would never have been 5009 * recorded). 5010 */ 5011 events &= vp->v_pollinfo->vpi_revents; 5012 vp->v_pollinfo->vpi_revents &= ~events; 5013 5014 mtx_unlock(&vp->v_pollinfo->vpi_lock); 5015 return (events); 5016 } 5017 vp->v_pollinfo->vpi_events |= events; 5018 selrecord(td, &vp->v_pollinfo->vpi_selinfo); 5019 mtx_unlock(&vp->v_pollinfo->vpi_lock); 5020 return (0); 5021 } 5022 5023 /* 5024 * Routine to create and manage a filesystem syncer vnode. 5025 */ 5026 #define sync_close ((int (*)(struct vop_close_args *))nullop) 5027 static int sync_fsync(struct vop_fsync_args *); 5028 static int sync_inactive(struct vop_inactive_args *); 5029 static int sync_reclaim(struct vop_reclaim_args *); 5030 5031 static struct vop_vector sync_vnodeops = { 5032 .vop_bypass = VOP_EOPNOTSUPP, 5033 .vop_close = sync_close, /* close */ 5034 .vop_fsync = sync_fsync, /* fsync */ 5035 .vop_inactive = sync_inactive, /* inactive */ 5036 .vop_need_inactive = vop_stdneed_inactive, /* need_inactive */ 5037 .vop_reclaim = sync_reclaim, /* reclaim */ 5038 .vop_lock1 = vop_stdlock, /* lock */ 5039 .vop_unlock = vop_stdunlock, /* unlock */ 5040 .vop_islocked = vop_stdislocked, /* islocked */ 5041 }; 5042 VFS_VOP_VECTOR_REGISTER(sync_vnodeops); 5043 5044 /* 5045 * Create a new filesystem syncer vnode for the specified mount point. 5046 */ 5047 void 5048 vfs_allocate_syncvnode(struct mount *mp) 5049 { 5050 struct vnode *vp; 5051 struct bufobj *bo; 5052 static long start, incr, next; 5053 int error; 5054 5055 /* Allocate a new vnode */ 5056 error = getnewvnode("syncer", mp, &sync_vnodeops, &vp); 5057 if (error != 0) 5058 panic("vfs_allocate_syncvnode: getnewvnode() failed"); 5059 vp->v_type = VNON; 5060 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 5061 vp->v_vflag |= VV_FORCEINSMQ; 5062 error = insmntque1(vp, mp); 5063 if (error != 0) 5064 panic("vfs_allocate_syncvnode: insmntque() failed"); 5065 vp->v_vflag &= ~VV_FORCEINSMQ; 5066 VOP_UNLOCK(vp); 5067 /* 5068 * Place the vnode onto the syncer worklist. We attempt to 5069 * scatter them about on the list so that they will go off 5070 * at evenly distributed times even if all the filesystems 5071 * are mounted at once. 5072 */ 5073 next += incr; 5074 if (next == 0 || next > syncer_maxdelay) { 5075 start /= 2; 5076 incr /= 2; 5077 if (start == 0) { 5078 start = syncer_maxdelay / 2; 5079 incr = syncer_maxdelay; 5080 } 5081 next = start; 5082 } 5083 bo = &vp->v_bufobj; 5084 BO_LOCK(bo); 5085 vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0); 5086 /* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. 
*/ 5087 mtx_lock(&sync_mtx); 5088 sync_vnode_count++; 5089 if (mp->mnt_syncer == NULL) { 5090 mp->mnt_syncer = vp; 5091 vp = NULL; 5092 } 5093 mtx_unlock(&sync_mtx); 5094 BO_UNLOCK(bo); 5095 if (vp != NULL) { 5096 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 5097 vgone(vp); 5098 vput(vp); 5099 } 5100 } 5101 5102 void 5103 vfs_deallocate_syncvnode(struct mount *mp) 5104 { 5105 struct vnode *vp; 5106 5107 mtx_lock(&sync_mtx); 5108 vp = mp->mnt_syncer; 5109 if (vp != NULL) 5110 mp->mnt_syncer = NULL; 5111 mtx_unlock(&sync_mtx); 5112 if (vp != NULL) 5113 vrele(vp); 5114 } 5115 5116 /* 5117 * Do a lazy sync of the filesystem. 5118 */ 5119 static int 5120 sync_fsync(struct vop_fsync_args *ap) 5121 { 5122 struct vnode *syncvp = ap->a_vp; 5123 struct mount *mp = syncvp->v_mount; 5124 int error, save; 5125 struct bufobj *bo; 5126 5127 /* 5128 * We only need to do something if this is a lazy evaluation. 5129 */ 5130 if (ap->a_waitfor != MNT_LAZY) 5131 return (0); 5132 5133 /* 5134 * Move ourselves to the back of the sync list. 5135 */ 5136 bo = &syncvp->v_bufobj; 5137 BO_LOCK(bo); 5138 vn_syncer_add_to_worklist(bo, syncdelay); 5139 BO_UNLOCK(bo); 5140 5141 /* 5142 * Walk the list of vnodes pushing all that are dirty and 5143 * not already on the sync list. 5144 */ 5145 if (vfs_busy(mp, MBF_NOWAIT) != 0) 5146 return (0); 5147 VOP_UNLOCK(syncvp); 5148 save = curthread_pflags_set(TDP_SYNCIO); 5149 /* 5150 * The filesystem at hand may be idle with free vnodes stored in the 5151 * batch. Return them instead of letting them stay there indefinitely. 5152 */ 5153 vfs_periodic(mp, MNT_NOWAIT); 5154 error = VFS_SYNC(mp, MNT_LAZY); 5155 curthread_pflags_restore(save); 5156 vn_lock(syncvp, LK_EXCLUSIVE | LK_RETRY); 5157 vfs_unbusy(mp); 5158 return (error); 5159 } 5160 5161 /* 5162 * The syncer vnode is no longer referenced. 5163 */ 5164 static int 5165 sync_inactive(struct vop_inactive_args *ap) 5166 { 5167 5168 vgone(ap->a_vp); 5169 return (0); 5170 } 5171 5172 /* 5173 * The syncer vnode is no longer needed and is being decommissioned. 5174 * 5175 * Modifications to the worklist must be protected by sync_mtx.
5176 */ 5177 static int 5178 sync_reclaim(struct vop_reclaim_args *ap) 5179 { 5180 struct vnode *vp = ap->a_vp; 5181 struct bufobj *bo; 5182 5183 bo = &vp->v_bufobj; 5184 BO_LOCK(bo); 5185 mtx_lock(&sync_mtx); 5186 if (vp->v_mount->mnt_syncer == vp) 5187 vp->v_mount->mnt_syncer = NULL; 5188 if (bo->bo_flag & BO_ONWORKLST) { 5189 LIST_REMOVE(bo, bo_synclist); 5190 syncer_worklist_len--; 5191 sync_vnode_count--; 5192 bo->bo_flag &= ~BO_ONWORKLST; 5193 } 5194 mtx_unlock(&sync_mtx); 5195 BO_UNLOCK(bo); 5196 5197 return (0); 5198 } 5199 5200 int 5201 vn_need_pageq_flush(struct vnode *vp) 5202 { 5203 struct vm_object *obj; 5204 5205 obj = vp->v_object; 5206 return (obj != NULL && (vp->v_vflag & VV_NOSYNC) == 0 && 5207 vm_object_mightbedirty(obj)); 5208 } 5209 5210 /* 5211 * Check if vnode represents a disk device 5212 */ 5213 bool 5214 vn_isdisk_error(struct vnode *vp, int *errp) 5215 { 5216 int error; 5217 5218 if (vp->v_type != VCHR) { 5219 error = ENOTBLK; 5220 goto out; 5221 } 5222 error = 0; 5223 dev_lock(); 5224 if (vp->v_rdev == NULL) 5225 error = ENXIO; 5226 else if (vp->v_rdev->si_devsw == NULL) 5227 error = ENXIO; 5228 else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK)) 5229 error = ENOTBLK; 5230 dev_unlock(); 5231 out: 5232 *errp = error; 5233 return (error == 0); 5234 } 5235 5236 bool 5237 vn_isdisk(struct vnode *vp) 5238 { 5239 int error; 5240 5241 return (vn_isdisk_error(vp, &error)); 5242 } 5243 5244 /* 5245 * VOP_FPLOOKUP_VEXEC routines are subject to special circumstances, see 5246 * the comment above cache_fplookup for details. 5247 */ 5248 int 5249 vaccess_vexec_smr(mode_t file_mode, uid_t file_uid, gid_t file_gid, struct ucred *cred) 5250 { 5251 int error; 5252 5253 VFS_SMR_ASSERT_ENTERED(); 5254 5255 /* Check the owner. */ 5256 if (cred->cr_uid == file_uid) { 5257 if (file_mode & S_IXUSR) 5258 return (0); 5259 goto out_error; 5260 } 5261 5262 /* Otherwise, check the groups (first match) */ 5263 if (groupmember(file_gid, cred)) { 5264 if (file_mode & S_IXGRP) 5265 return (0); 5266 goto out_error; 5267 } 5268 5269 /* Otherwise, check everyone else. */ 5270 if (file_mode & S_IXOTH) 5271 return (0); 5272 out_error: 5273 /* 5274 * Permission check failed, but it is possible denial will get overwritten 5275 * (e.g., when root is traversing through a 700 directory owned by someone 5276 * else). 5277 * 5278 * vaccess() calls priv_check_cred which in turn can descent into MAC 5279 * modules overriding this result. It's quite unclear what semantics 5280 * are allowed for them to operate, thus for safety we don't call them 5281 * from within the SMR section. This also means if any such modules 5282 * are present, we have to let the regular lookup decide. 5283 */ 5284 error = priv_check_cred_vfs_lookup_nomac(cred); 5285 switch (error) { 5286 case 0: 5287 return (0); 5288 case EAGAIN: 5289 /* 5290 * MAC modules present. 5291 */ 5292 return (EAGAIN); 5293 case EPERM: 5294 return (EACCES); 5295 default: 5296 return (error); 5297 } 5298 } 5299 5300 /* 5301 * Common filesystem object access control check routine. Accepts a 5302 * vnode's type, "mode", uid and gid, requested access mode, and credentials. 5303 * Returns 0 on success, or an errno on failure. 
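 * A hypothetical VOP_ACCESS implementation for a filesystem that stores
 * POSIX mode/uid/gid in its inode might reduce to the following (a sketch
 * only; examplefs/VTOEXAMPLE are made-up names, and a real filesystem
 * would also handle ACLs, immutable flags, read-only mounts, etc.):
 *
 *	static int
 *	examplefs_access(struct vop_access_args *ap)
 *	{
 *		struct exampleinode *ip = VTOEXAMPLE(ap->a_vp);
 *
 *		return (vaccess(ap->a_vp->v_type, ip->i_mode, ip->i_uid,
 *		    ip->i_gid, ap->a_accmode, ap->a_cred));
 *	}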
5304 */ 5305 int 5306 vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid, 5307 accmode_t accmode, struct ucred *cred) 5308 { 5309 accmode_t dac_granted; 5310 accmode_t priv_granted; 5311 5312 KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0, 5313 ("invalid bit in accmode")); 5314 KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE), 5315 ("VAPPEND without VWRITE")); 5316 5317 /* 5318 * Look for a normal, non-privileged way to access the file/directory 5319 * as requested. If it exists, go with that. 5320 */ 5321 5322 dac_granted = 0; 5323 5324 /* Check the owner. */ 5325 if (cred->cr_uid == file_uid) { 5326 dac_granted |= VADMIN; 5327 if (file_mode & S_IXUSR) 5328 dac_granted |= VEXEC; 5329 if (file_mode & S_IRUSR) 5330 dac_granted |= VREAD; 5331 if (file_mode & S_IWUSR) 5332 dac_granted |= (VWRITE | VAPPEND); 5333 5334 if ((accmode & dac_granted) == accmode) 5335 return (0); 5336 5337 goto privcheck; 5338 } 5339 5340 /* Otherwise, check the groups (first match) */ 5341 if (groupmember(file_gid, cred)) { 5342 if (file_mode & S_IXGRP) 5343 dac_granted |= VEXEC; 5344 if (file_mode & S_IRGRP) 5345 dac_granted |= VREAD; 5346 if (file_mode & S_IWGRP) 5347 dac_granted |= (VWRITE | VAPPEND); 5348 5349 if ((accmode & dac_granted) == accmode) 5350 return (0); 5351 5352 goto privcheck; 5353 } 5354 5355 /* Otherwise, check everyone else. */ 5356 if (file_mode & S_IXOTH) 5357 dac_granted |= VEXEC; 5358 if (file_mode & S_IROTH) 5359 dac_granted |= VREAD; 5360 if (file_mode & S_IWOTH) 5361 dac_granted |= (VWRITE | VAPPEND); 5362 if ((accmode & dac_granted) == accmode) 5363 return (0); 5364 5365 privcheck: 5366 /* 5367 * Build a privilege mask to determine if the set of privileges 5368 * satisfies the requirements when combined with the granted mask 5369 * from above. For each privilege, if the privilege is required, 5370 * bitwise or the request type onto the priv_granted mask. 5371 */ 5372 priv_granted = 0; 5373 5374 if (type == VDIR) { 5375 /* 5376 * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC 5377 * requests, instead of PRIV_VFS_EXEC. 5378 */ 5379 if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && 5380 !priv_check_cred(cred, PRIV_VFS_LOOKUP)) 5381 priv_granted |= VEXEC; 5382 } else { 5383 /* 5384 * Ensure that at least one execute bit is on. Otherwise, 5385 * a privileged user will always succeed, and we don't want 5386 * this to happen unless the file really is executable. 5387 */ 5388 if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && 5389 (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 && 5390 !priv_check_cred(cred, PRIV_VFS_EXEC)) 5391 priv_granted |= VEXEC; 5392 } 5393 5394 if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) && 5395 !priv_check_cred(cred, PRIV_VFS_READ)) 5396 priv_granted |= VREAD; 5397 5398 if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) && 5399 !priv_check_cred(cred, PRIV_VFS_WRITE)) 5400 priv_granted |= (VWRITE | VAPPEND); 5401 5402 if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) && 5403 !priv_check_cred(cred, PRIV_VFS_ADMIN)) 5404 priv_granted |= VADMIN; 5405 5406 if ((accmode & (priv_granted | dac_granted)) == accmode) { 5407 return (0); 5408 } 5409 5410 return ((accmode & VADMIN) ? EPERM : EACCES); 5411 } 5412 5413 /* 5414 * Credential check based on process requesting service, and per-attribute 5415 * permissions. 
5416 */ 5417 int 5418 extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred, 5419 struct thread *td, accmode_t accmode) 5420 { 5421 5422 /* 5423 * Kernel-invoked always succeeds. 5424 */ 5425 if (cred == NOCRED) 5426 return (0); 5427 5428 /* 5429 * Do not allow privileged processes in jail to directly manipulate 5430 * system attributes. 5431 */ 5432 switch (attrnamespace) { 5433 case EXTATTR_NAMESPACE_SYSTEM: 5434 /* Potentially should be: return (EPERM); */ 5435 return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM)); 5436 case EXTATTR_NAMESPACE_USER: 5437 return (VOP_ACCESS(vp, accmode, cred, td)); 5438 default: 5439 return (EPERM); 5440 } 5441 } 5442 5443 #ifdef DEBUG_VFS_LOCKS 5444 int vfs_badlock_ddb = 1; /* Drop into debugger on violation. */ 5445 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0, 5446 "Drop into debugger on lock violation"); 5447 5448 int vfs_badlock_mutex = 1; /* Check for interlock across VOPs. */ 5449 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex, 5450 0, "Check for interlock across VOPs"); 5451 5452 int vfs_badlock_print = 1; /* Print lock violations. */ 5453 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print, 5454 0, "Print lock violations"); 5455 5456 int vfs_badlock_vnode = 1; /* Print vnode details on lock violations. */ 5457 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_vnode, CTLFLAG_RW, &vfs_badlock_vnode, 5458 0, "Print vnode details on lock violations"); 5459 5460 #ifdef KDB 5461 int vfs_badlock_backtrace = 1; /* Print backtrace at lock violations. */ 5462 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW, 5463 &vfs_badlock_backtrace, 0, "Print backtrace at lock violations"); 5464 #endif 5465 5466 static void 5467 vfs_badlock(const char *msg, const char *str, struct vnode *vp) 5468 { 5469 5470 #ifdef KDB 5471 if (vfs_badlock_backtrace) 5472 kdb_backtrace(); 5473 #endif 5474 if (vfs_badlock_vnode) 5475 vn_printf(vp, "vnode "); 5476 if (vfs_badlock_print) 5477 printf("%s: %p %s\n", str, (void *)vp, msg); 5478 if (vfs_badlock_ddb) 5479 kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); 5480 } 5481 5482 void 5483 assert_vi_locked(struct vnode *vp, const char *str) 5484 { 5485 5486 if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp))) 5487 vfs_badlock("interlock is not locked but should be", str, vp); 5488 } 5489 5490 void 5491 assert_vi_unlocked(struct vnode *vp, const char *str) 5492 { 5493 5494 if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp))) 5495 vfs_badlock("interlock is locked but should not be", str, vp); 5496 } 5497 5498 void 5499 assert_vop_locked(struct vnode *vp, const char *str) 5500 { 5501 int locked; 5502 5503 if (KERNEL_PANICKED() || vp == NULL) 5504 return; 5505 5506 locked = VOP_ISLOCKED(vp); 5507 if (locked == 0 || locked == LK_EXCLOTHER) 5508 vfs_badlock("is not locked but should be", str, vp); 5509 } 5510 5511 void 5512 assert_vop_unlocked(struct vnode *vp, const char *str) 5513 { 5514 if (KERNEL_PANICKED() || vp == NULL) 5515 return; 5516 5517 if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE) 5518 vfs_badlock("is locked but should not be", str, vp); 5519 } 5520 5521 void 5522 assert_vop_elocked(struct vnode *vp, const char *str) 5523 { 5524 if (KERNEL_PANICKED() || vp == NULL) 5525 return; 5526 5527 if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) 5528 vfs_badlock("is not exclusive locked but should be", str, vp); 5529 } 5530 #endif /* DEBUG_VFS_LOCKS */ 5531 5532 void 5533 vop_rename_fail(struct vop_rename_args *ap) 5534 { 5535 5536 if (ap->a_tvp != 
NULL) 5537 vput(ap->a_tvp); 5538 if (ap->a_tdvp == ap->a_tvp) 5539 vrele(ap->a_tdvp); 5540 else 5541 vput(ap->a_tdvp); 5542 vrele(ap->a_fdvp); 5543 vrele(ap->a_fvp); 5544 } 5545 5546 void 5547 vop_rename_pre(void *ap) 5548 { 5549 struct vop_rename_args *a = ap; 5550 5551 #ifdef DEBUG_VFS_LOCKS 5552 if (a->a_tvp) 5553 ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME"); 5554 ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME"); 5555 ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME"); 5556 ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME"); 5557 5558 /* Check the source (from). */ 5559 if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock && 5560 (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock)) 5561 ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked"); 5562 if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock) 5563 ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked"); 5564 5565 /* Check the target. */ 5566 if (a->a_tvp) 5567 ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked"); 5568 ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked"); 5569 #endif 5570 /* 5571 * It may be tempting to add vn_seqc_write_begin/end calls here and 5572 * in vop_rename_post but that's not going to work out since some 5573 * filesystems relookup vnodes mid-rename. This is probably a bug. 5574 * 5575 * For now filesystems are expected to do the relevant calls after they 5576 * decide what vnodes to operate on. 5577 */ 5578 if (a->a_tdvp != a->a_fdvp) 5579 vhold(a->a_fdvp); 5580 if (a->a_tvp != a->a_fvp) 5581 vhold(a->a_fvp); 5582 vhold(a->a_tdvp); 5583 if (a->a_tvp) 5584 vhold(a->a_tvp); 5585 } 5586 5587 #ifdef DEBUG_VFS_LOCKS 5588 void 5589 vop_fplookup_vexec_debugpre(void *ap __unused) 5590 { 5591 5592 VFS_SMR_ASSERT_ENTERED(); 5593 } 5594 5595 void 5596 vop_fplookup_vexec_debugpost(void *ap __unused, int rc __unused) 5597 { 5598 5599 VFS_SMR_ASSERT_ENTERED(); 5600 } 5601 5602 void 5603 vop_fplookup_symlink_debugpre(void *ap __unused) 5604 { 5605 5606 VFS_SMR_ASSERT_ENTERED(); 5607 } 5608 5609 void 5610 vop_fplookup_symlink_debugpost(void *ap __unused, int rc __unused) 5611 { 5612 5613 VFS_SMR_ASSERT_ENTERED(); 5614 } 5615 5616 static void 5617 vop_fsync_debugprepost(struct vnode *vp, const char *name) 5618 { 5619 if (vp->v_type == VCHR) 5620 ; 5621 else if (MNT_EXTENDED_SHARED(vp->v_mount)) 5622 ASSERT_VOP_LOCKED(vp, name); 5623 else 5624 ASSERT_VOP_ELOCKED(vp, name); 5625 } 5626 5627 void 5628 vop_fsync_debugpre(void *a) 5629 { 5630 struct vop_fsync_args *ap; 5631 5632 ap = a; 5633 vop_fsync_debugprepost(ap->a_vp, "fsync"); 5634 } 5635 5636 void 5637 vop_fsync_debugpost(void *a, int rc __unused) 5638 { 5639 struct vop_fsync_args *ap; 5640 5641 ap = a; 5642 vop_fsync_debugprepost(ap->a_vp, "fsync"); 5643 } 5644 5645 void 5646 vop_fdatasync_debugpre(void *a) 5647 { 5648 struct vop_fdatasync_args *ap; 5649 5650 ap = a; 5651 vop_fsync_debugprepost(ap->a_vp, "fsync"); 5652 } 5653 5654 void 5655 vop_fdatasync_debugpost(void *a, int rc __unused) 5656 { 5657 struct vop_fdatasync_args *ap; 5658 5659 ap = a; 5660 vop_fsync_debugprepost(ap->a_vp, "fsync"); 5661 } 5662 5663 void 5664 vop_strategy_debugpre(void *ap) 5665 { 5666 struct vop_strategy_args *a; 5667 struct buf *bp; 5668 5669 a = ap; 5670 bp = a->a_bp; 5671 5672 /* 5673 * Cluster ops lock their component buffers but not the IO container. 
5674 */ 5675 if ((bp->b_flags & B_CLUSTER) != 0) 5676 return; 5677 5678 if (!KERNEL_PANICKED() && !BUF_ISLOCKED(bp)) { 5679 if (vfs_badlock_print) 5680 printf( 5681 "VOP_STRATEGY: bp is not locked but should be\n"); 5682 if (vfs_badlock_ddb) 5683 kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); 5684 } 5685 } 5686 5687 void 5688 vop_lock_debugpre(void *ap) 5689 { 5690 struct vop_lock1_args *a = ap; 5691 5692 if ((a->a_flags & LK_INTERLOCK) == 0) 5693 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); 5694 else 5695 ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK"); 5696 } 5697 5698 void 5699 vop_lock_debugpost(void *ap, int rc) 5700 { 5701 struct vop_lock1_args *a = ap; 5702 5703 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); 5704 if (rc == 0 && (a->a_flags & LK_EXCLOTHER) == 0) 5705 ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK"); 5706 } 5707 5708 void 5709 vop_unlock_debugpre(void *ap) 5710 { 5711 struct vop_unlock_args *a = ap; 5712 5713 ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK"); 5714 } 5715 5716 void 5717 vop_need_inactive_debugpre(void *ap) 5718 { 5719 struct vop_need_inactive_args *a = ap; 5720 5721 ASSERT_VI_LOCKED(a->a_vp, "VOP_NEED_INACTIVE"); 5722 } 5723 5724 void 5725 vop_need_inactive_debugpost(void *ap, int rc) 5726 { 5727 struct vop_need_inactive_args *a = ap; 5728 5729 ASSERT_VI_LOCKED(a->a_vp, "VOP_NEED_INACTIVE"); 5730 } 5731 #endif 5732 5733 void 5734 vop_create_pre(void *ap) 5735 { 5736 struct vop_create_args *a; 5737 struct vnode *dvp; 5738 5739 a = ap; 5740 dvp = a->a_dvp; 5741 vn_seqc_write_begin(dvp); 5742 } 5743 5744 void 5745 vop_create_post(void *ap, int rc) 5746 { 5747 struct vop_create_args *a; 5748 struct vnode *dvp; 5749 5750 a = ap; 5751 dvp = a->a_dvp; 5752 vn_seqc_write_end(dvp); 5753 if (!rc) 5754 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); 5755 } 5756 5757 void 5758 vop_whiteout_pre(void *ap) 5759 { 5760 struct vop_whiteout_args *a; 5761 struct vnode *dvp; 5762 5763 a = ap; 5764 dvp = a->a_dvp; 5765 vn_seqc_write_begin(dvp); 5766 } 5767 5768 void 5769 vop_whiteout_post(void *ap, int rc) 5770 { 5771 struct vop_whiteout_args *a; 5772 struct vnode *dvp; 5773 5774 a = ap; 5775 dvp = a->a_dvp; 5776 vn_seqc_write_end(dvp); 5777 } 5778 5779 void 5780 vop_deleteextattr_pre(void *ap) 5781 { 5782 struct vop_deleteextattr_args *a; 5783 struct vnode *vp; 5784 5785 a = ap; 5786 vp = a->a_vp; 5787 vn_seqc_write_begin(vp); 5788 } 5789 5790 void 5791 vop_deleteextattr_post(void *ap, int rc) 5792 { 5793 struct vop_deleteextattr_args *a; 5794 struct vnode *vp; 5795 5796 a = ap; 5797 vp = a->a_vp; 5798 vn_seqc_write_end(vp); 5799 if (!rc) 5800 VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); 5801 } 5802 5803 void 5804 vop_link_pre(void *ap) 5805 { 5806 struct vop_link_args *a; 5807 struct vnode *vp, *tdvp; 5808 5809 a = ap; 5810 vp = a->a_vp; 5811 tdvp = a->a_tdvp; 5812 vn_seqc_write_begin(vp); 5813 vn_seqc_write_begin(tdvp); 5814 } 5815 5816 void 5817 vop_link_post(void *ap, int rc) 5818 { 5819 struct vop_link_args *a; 5820 struct vnode *vp, *tdvp; 5821 5822 a = ap; 5823 vp = a->a_vp; 5824 tdvp = a->a_tdvp; 5825 vn_seqc_write_end(vp); 5826 vn_seqc_write_end(tdvp); 5827 if (!rc) { 5828 VFS_KNOTE_LOCKED(vp, NOTE_LINK); 5829 VFS_KNOTE_LOCKED(tdvp, NOTE_WRITE); 5830 } 5831 } 5832 5833 void 5834 vop_mkdir_pre(void *ap) 5835 { 5836 struct vop_mkdir_args *a; 5837 struct vnode *dvp; 5838 5839 a = ap; 5840 dvp = a->a_dvp; 5841 vn_seqc_write_begin(dvp); 5842 } 5843 5844 void 5845 vop_mkdir_post(void *ap, int rc) 5846 { 5847 struct vop_mkdir_args *a; 5848 struct vnode *dvp; 5849 5850 a = ap; 5851 dvp = a->a_dvp; 5852 vn_seqc_write_end(dvp); 
5853 if (!rc) 5854 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK); 5855 } 5856 5857 #ifdef DEBUG_VFS_LOCKS 5858 void 5859 vop_mkdir_debugpost(void *ap, int rc) 5860 { 5861 struct vop_mkdir_args *a; 5862 5863 a = ap; 5864 if (!rc) 5865 cache_validate(a->a_dvp, *a->a_vpp, a->a_cnp); 5866 } 5867 #endif 5868 5869 void 5870 vop_mknod_pre(void *ap) 5871 { 5872 struct vop_mknod_args *a; 5873 struct vnode *dvp; 5874 5875 a = ap; 5876 dvp = a->a_dvp; 5877 vn_seqc_write_begin(dvp); 5878 } 5879 5880 void 5881 vop_mknod_post(void *ap, int rc) 5882 { 5883 struct vop_mknod_args *a; 5884 struct vnode *dvp; 5885 5886 a = ap; 5887 dvp = a->a_dvp; 5888 vn_seqc_write_end(dvp); 5889 if (!rc) 5890 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); 5891 } 5892 5893 void 5894 vop_reclaim_post(void *ap, int rc) 5895 { 5896 struct vop_reclaim_args *a; 5897 struct vnode *vp; 5898 5899 a = ap; 5900 vp = a->a_vp; 5901 ASSERT_VOP_IN_SEQC(vp); 5902 if (!rc) 5903 VFS_KNOTE_LOCKED(vp, NOTE_REVOKE); 5904 } 5905 5906 void 5907 vop_remove_pre(void *ap) 5908 { 5909 struct vop_remove_args *a; 5910 struct vnode *dvp, *vp; 5911 5912 a = ap; 5913 dvp = a->a_dvp; 5914 vp = a->a_vp; 5915 vn_seqc_write_begin(dvp); 5916 vn_seqc_write_begin(vp); 5917 } 5918 5919 void 5920 vop_remove_post(void *ap, int rc) 5921 { 5922 struct vop_remove_args *a; 5923 struct vnode *dvp, *vp; 5924 5925 a = ap; 5926 dvp = a->a_dvp; 5927 vp = a->a_vp; 5928 vn_seqc_write_end(dvp); 5929 vn_seqc_write_end(vp); 5930 if (!rc) { 5931 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); 5932 VFS_KNOTE_LOCKED(vp, NOTE_DELETE); 5933 } 5934 } 5935 5936 void 5937 vop_rename_post(void *ap, int rc) 5938 { 5939 struct vop_rename_args *a = ap; 5940 long hint; 5941 5942 if (!rc) { 5943 hint = NOTE_WRITE; 5944 if (a->a_fdvp == a->a_tdvp) { 5945 if (a->a_tvp != NULL && a->a_tvp->v_type == VDIR) 5946 hint |= NOTE_LINK; 5947 VFS_KNOTE_UNLOCKED(a->a_fdvp, hint); 5948 VFS_KNOTE_UNLOCKED(a->a_tdvp, hint); 5949 } else { 5950 hint |= NOTE_EXTEND; 5951 if (a->a_fvp->v_type == VDIR) 5952 hint |= NOTE_LINK; 5953 VFS_KNOTE_UNLOCKED(a->a_fdvp, hint); 5954 5955 if (a->a_fvp->v_type == VDIR && a->a_tvp != NULL && 5956 a->a_tvp->v_type == VDIR) 5957 hint &= ~NOTE_LINK; 5958 VFS_KNOTE_UNLOCKED(a->a_tdvp, hint); 5959 } 5960 5961 VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME); 5962 if (a->a_tvp) 5963 VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE); 5964 } 5965 if (a->a_tdvp != a->a_fdvp) 5966 vdrop(a->a_fdvp); 5967 if (a->a_tvp != a->a_fvp) 5968 vdrop(a->a_fvp); 5969 vdrop(a->a_tdvp); 5970 if (a->a_tvp) 5971 vdrop(a->a_tvp); 5972 } 5973 5974 void 5975 vop_rmdir_pre(void *ap) 5976 { 5977 struct vop_rmdir_args *a; 5978 struct vnode *dvp, *vp; 5979 5980 a = ap; 5981 dvp = a->a_dvp; 5982 vp = a->a_vp; 5983 vn_seqc_write_begin(dvp); 5984 vn_seqc_write_begin(vp); 5985 } 5986 5987 void 5988 vop_rmdir_post(void *ap, int rc) 5989 { 5990 struct vop_rmdir_args *a; 5991 struct vnode *dvp, *vp; 5992 5993 a = ap; 5994 dvp = a->a_dvp; 5995 vp = a->a_vp; 5996 vn_seqc_write_end(dvp); 5997 vn_seqc_write_end(vp); 5998 if (!rc) { 5999 vp->v_vflag |= VV_UNLINKED; 6000 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK); 6001 VFS_KNOTE_LOCKED(vp, NOTE_DELETE); 6002 } 6003 } 6004 6005 void 6006 vop_setattr_pre(void *ap) 6007 { 6008 struct vop_setattr_args *a; 6009 struct vnode *vp; 6010 6011 a = ap; 6012 vp = a->a_vp; 6013 vn_seqc_write_begin(vp); 6014 } 6015 6016 void 6017 vop_setattr_post(void *ap, int rc) 6018 { 6019 struct vop_setattr_args *a; 6020 struct vnode *vp; 6021 6022 a = ap; 6023 vp = a->a_vp; 6024 vn_seqc_write_end(vp); 6025 if (!rc) 6026 
VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB); 6027 } 6028 6029 void 6030 vop_setacl_pre(void *ap) 6031 { 6032 struct vop_setacl_args *a; 6033 struct vnode *vp; 6034 6035 a = ap; 6036 vp = a->a_vp; 6037 vn_seqc_write_begin(vp); 6038 } 6039 6040 void 6041 vop_setacl_post(void *ap, int rc __unused) 6042 { 6043 struct vop_setacl_args *a; 6044 struct vnode *vp; 6045 6046 a = ap; 6047 vp = a->a_vp; 6048 vn_seqc_write_end(vp); 6049 } 6050 6051 void 6052 vop_setextattr_pre(void *ap) 6053 { 6054 struct vop_setextattr_args *a; 6055 struct vnode *vp; 6056 6057 a = ap; 6058 vp = a->a_vp; 6059 vn_seqc_write_begin(vp); 6060 } 6061 6062 void 6063 vop_setextattr_post(void *ap, int rc) 6064 { 6065 struct vop_setextattr_args *a; 6066 struct vnode *vp; 6067 6068 a = ap; 6069 vp = a->a_vp; 6070 vn_seqc_write_end(vp); 6071 if (!rc) 6072 VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB); 6073 } 6074 6075 void 6076 vop_symlink_pre(void *ap) 6077 { 6078 struct vop_symlink_args *a; 6079 struct vnode *dvp; 6080 6081 a = ap; 6082 dvp = a->a_dvp; 6083 vn_seqc_write_begin(dvp); 6084 } 6085 6086 void 6087 vop_symlink_post(void *ap, int rc) 6088 { 6089 struct vop_symlink_args *a; 6090 struct vnode *dvp; 6091 6092 a = ap; 6093 dvp = a->a_dvp; 6094 vn_seqc_write_end(dvp); 6095 if (!rc) 6096 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); 6097 } 6098 6099 void 6100 vop_open_post(void *ap, int rc) 6101 { 6102 struct vop_open_args *a = ap; 6103 6104 if (!rc) 6105 VFS_KNOTE_LOCKED(a->a_vp, NOTE_OPEN); 6106 } 6107 6108 void 6109 vop_close_post(void *ap, int rc) 6110 { 6111 struct vop_close_args *a = ap; 6112 6113 if (!rc && (a->a_cred != NOCRED || /* filter out revokes */ 6114 !VN_IS_DOOMED(a->a_vp))) { 6115 VFS_KNOTE_LOCKED(a->a_vp, (a->a_fflag & FWRITE) != 0 ? 6116 NOTE_CLOSE_WRITE : NOTE_CLOSE); 6117 } 6118 } 6119 6120 void 6121 vop_read_post(void *ap, int rc) 6122 { 6123 struct vop_read_args *a = ap; 6124 6125 if (!rc) 6126 VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ); 6127 } 6128 6129 void 6130 vop_read_pgcache_post(void *ap, int rc) 6131 { 6132 struct vop_read_pgcache_args *a = ap; 6133 6134 if (!rc) 6135 VFS_KNOTE_UNLOCKED(a->a_vp, NOTE_READ); 6136 } 6137 6138 void 6139 vop_readdir_post(void *ap, int rc) 6140 { 6141 struct vop_readdir_args *a = ap; 6142 6143 if (!rc) 6144 VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ); 6145 } 6146 6147 static struct knlist fs_knlist; 6148 6149 static void 6150 vfs_event_init(void *arg) 6151 { 6152 knlist_init_mtx(&fs_knlist, NULL); 6153 } 6154 /* XXX - correct order? 
*/ 6155 SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL); 6156 6157 void 6158 vfs_event_signal(fsid_t *fsid, uint32_t event, intptr_t data __unused) 6159 { 6160 6161 KNOTE_UNLOCKED(&fs_knlist, event); 6162 } 6163 6164 static int filt_fsattach(struct knote *kn); 6165 static void filt_fsdetach(struct knote *kn); 6166 static int filt_fsevent(struct knote *kn, long hint); 6167 6168 struct filterops fs_filtops = { 6169 .f_isfd = 0, 6170 .f_attach = filt_fsattach, 6171 .f_detach = filt_fsdetach, 6172 .f_event = filt_fsevent 6173 }; 6174 6175 static int 6176 filt_fsattach(struct knote *kn) 6177 { 6178 6179 kn->kn_flags |= EV_CLEAR; 6180 knlist_add(&fs_knlist, kn, 0); 6181 return (0); 6182 } 6183 6184 static void 6185 filt_fsdetach(struct knote *kn) 6186 { 6187 6188 knlist_remove(&fs_knlist, kn, 0); 6189 } 6190 6191 static int 6192 filt_fsevent(struct knote *kn, long hint) 6193 { 6194 6195 kn->kn_fflags |= kn->kn_sfflags & hint; 6196 6197 return (kn->kn_fflags != 0); 6198 } 6199 6200 static int 6201 sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS) 6202 { 6203 struct vfsidctl vc; 6204 int error; 6205 struct mount *mp; 6206 6207 error = SYSCTL_IN(req, &vc, sizeof(vc)); 6208 if (error) 6209 return (error); 6210 if (vc.vc_vers != VFS_CTL_VERS1) 6211 return (EINVAL); 6212 mp = vfs_getvfs(&vc.vc_fsid); 6213 if (mp == NULL) 6214 return (ENOENT); 6215 /* ensure that a specific sysctl goes to the right filesystem. */ 6216 if (strcmp(vc.vc_fstypename, "*") != 0 && 6217 strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) { 6218 vfs_rel(mp); 6219 return (EINVAL); 6220 } 6221 VCTLTOREQ(&vc, req); 6222 error = VFS_SYSCTL(mp, vc.vc_op, req); 6223 vfs_rel(mp); 6224 return (error); 6225 } 6226 6227 SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE | CTLFLAG_MPSAFE | CTLFLAG_WR, 6228 NULL, 0, sysctl_vfs_ctl, "", 6229 "Sysctl by fsid"); 6230 6231 /* 6232 * Function to initialize a va_filerev field sensibly. 6233 * XXX: Wouldn't a random number make a lot more sense ?? 
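 * The returned value packs the boot-relative bintime: the seconds count
 * goes in the upper 32 bits and the most significant half of the fraction
 * in the lower 32 bits, so the result increases monotonically within a
 * single boot.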
6234 */ 6235 u_quad_t 6236 init_va_filerev(void) 6237 { 6238 struct bintime bt; 6239 6240 getbinuptime(&bt); 6241 return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL)); 6242 } 6243 6244 static int filt_vfsread(struct knote *kn, long hint); 6245 static int filt_vfswrite(struct knote *kn, long hint); 6246 static int filt_vfsvnode(struct knote *kn, long hint); 6247 static void filt_vfsdetach(struct knote *kn); 6248 static struct filterops vfsread_filtops = { 6249 .f_isfd = 1, 6250 .f_detach = filt_vfsdetach, 6251 .f_event = filt_vfsread 6252 }; 6253 static struct filterops vfswrite_filtops = { 6254 .f_isfd = 1, 6255 .f_detach = filt_vfsdetach, 6256 .f_event = filt_vfswrite 6257 }; 6258 static struct filterops vfsvnode_filtops = { 6259 .f_isfd = 1, 6260 .f_detach = filt_vfsdetach, 6261 .f_event = filt_vfsvnode 6262 }; 6263 6264 static void 6265 vfs_knllock(void *arg) 6266 { 6267 struct vnode *vp = arg; 6268 6269 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 6270 } 6271 6272 static void 6273 vfs_knlunlock(void *arg) 6274 { 6275 struct vnode *vp = arg; 6276 6277 VOP_UNLOCK(vp); 6278 } 6279 6280 static void 6281 vfs_knl_assert_lock(void *arg, int what) 6282 { 6283 #ifdef DEBUG_VFS_LOCKS 6284 struct vnode *vp = arg; 6285 6286 if (what == LA_LOCKED) 6287 ASSERT_VOP_LOCKED(vp, "vfs_knl_assert_locked"); 6288 else 6289 ASSERT_VOP_UNLOCKED(vp, "vfs_knl_assert_unlocked"); 6290 #endif 6291 } 6292 6293 int 6294 vfs_kqfilter(struct vop_kqfilter_args *ap) 6295 { 6296 struct vnode *vp = ap->a_vp; 6297 struct knote *kn = ap->a_kn; 6298 struct knlist *knl; 6299 6300 KASSERT(vp->v_type != VFIFO || (kn->kn_filter != EVFILT_READ && 6301 kn->kn_filter != EVFILT_WRITE), 6302 ("READ/WRITE filter on a FIFO leaked through")); 6303 switch (kn->kn_filter) { 6304 case EVFILT_READ: 6305 kn->kn_fop = &vfsread_filtops; 6306 break; 6307 case EVFILT_WRITE: 6308 kn->kn_fop = &vfswrite_filtops; 6309 break; 6310 case EVFILT_VNODE: 6311 kn->kn_fop = &vfsvnode_filtops; 6312 break; 6313 default: 6314 return (EINVAL); 6315 } 6316 6317 kn->kn_hook = (caddr_t)vp; 6318 6319 v_addpollinfo(vp); 6320 if (vp->v_pollinfo == NULL) 6321 return (ENOMEM); 6322 knl = &vp->v_pollinfo->vpi_selinfo.si_note; 6323 vhold(vp); 6324 knlist_add(knl, kn, 0); 6325 6326 return (0); 6327 } 6328 6329 /* 6330 * Detach knote from vnode 6331 */ 6332 static void 6333 filt_vfsdetach(struct knote *kn) 6334 { 6335 struct vnode *vp = (struct vnode *)kn->kn_hook; 6336 6337 KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo")); 6338 knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0); 6339 vdrop(vp); 6340 } 6341 6342 /*ARGSUSED*/ 6343 static int 6344 filt_vfsread(struct knote *kn, long hint) 6345 { 6346 struct vnode *vp = (struct vnode *)kn->kn_hook; 6347 struct vattr va; 6348 int res; 6349 6350 /* 6351 * filesystem is gone, so set the EOF flag and schedule 6352 * the knote for deletion. 
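 * NOTE_REVOKE is posted when the vnode is reclaimed (see
 * vop_reclaim_post()); a zero hint on a VBAD vnode means the vnode was
 * already reclaimed by the time the knote was scanned.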
6353 */ 6354 if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) { 6355 VI_LOCK(vp); 6356 kn->kn_flags |= (EV_EOF | EV_ONESHOT); 6357 VI_UNLOCK(vp); 6358 return (1); 6359 } 6360 6361 if (VOP_GETATTR(vp, &va, curthread->td_ucred)) 6362 return (0); 6363 6364 VI_LOCK(vp); 6365 kn->kn_data = va.va_size - kn->kn_fp->f_offset; 6366 res = (kn->kn_sfflags & NOTE_FILE_POLL) != 0 || kn->kn_data != 0; 6367 VI_UNLOCK(vp); 6368 return (res); 6369 } 6370 6371 /*ARGSUSED*/ 6372 static int 6373 filt_vfswrite(struct knote *kn, long hint) 6374 { 6375 struct vnode *vp = (struct vnode *)kn->kn_hook; 6376 6377 VI_LOCK(vp); 6378 6379 /* 6380 * filesystem is gone, so set the EOF flag and schedule 6381 * the knote for deletion. 6382 */ 6383 if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) 6384 kn->kn_flags |= (EV_EOF | EV_ONESHOT); 6385 6386 kn->kn_data = 0; 6387 VI_UNLOCK(vp); 6388 return (1); 6389 } 6390 6391 static int 6392 filt_vfsvnode(struct knote *kn, long hint) 6393 { 6394 struct vnode *vp = (struct vnode *)kn->kn_hook; 6395 int res; 6396 6397 VI_LOCK(vp); 6398 if (kn->kn_sfflags & hint) 6399 kn->kn_fflags |= hint; 6400 if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) { 6401 kn->kn_flags |= EV_EOF; 6402 VI_UNLOCK(vp); 6403 return (1); 6404 } 6405 res = (kn->kn_fflags != 0); 6406 VI_UNLOCK(vp); 6407 return (res); 6408 } 6409 6410 /* 6411 * Returns whether the directory is empty or not. 6412 * If it is empty, the return value is 0; otherwise 6413 * the return value is an error value (which may 6414 * be ENOTEMPTY). 6415 */ 6416 int 6417 vfs_emptydir(struct vnode *vp) 6418 { 6419 struct uio uio; 6420 struct iovec iov; 6421 struct dirent *dirent, *dp, *endp; 6422 int error, eof; 6423 6424 error = 0; 6425 eof = 0; 6426 6427 ASSERT_VOP_LOCKED(vp, "vfs_emptydir"); 6428 VNASSERT(vp->v_type == VDIR, vp, ("vp is not a directory")); 6429 6430 dirent = malloc(sizeof(struct dirent), M_TEMP, M_WAITOK); 6431 iov.iov_base = dirent; 6432 iov.iov_len = sizeof(struct dirent); 6433 6434 uio.uio_iov = &iov; 6435 uio.uio_iovcnt = 1; 6436 uio.uio_offset = 0; 6437 uio.uio_resid = sizeof(struct dirent); 6438 uio.uio_segflg = UIO_SYSSPACE; 6439 uio.uio_rw = UIO_READ; 6440 uio.uio_td = curthread; 6441 6442 while (eof == 0 && error == 0) { 6443 error = VOP_READDIR(vp, &uio, curthread->td_ucred, &eof, 6444 NULL, NULL); 6445 if (error != 0) 6446 break; 6447 endp = (void *)((uint8_t *)dirent + 6448 sizeof(struct dirent) - uio.uio_resid); 6449 for (dp = dirent; dp < endp; 6450 dp = (void *)((uint8_t *)dp + GENERIC_DIRSIZ(dp))) { 6451 if (dp->d_type == DT_WHT) 6452 continue; 6453 if (dp->d_namlen == 0) 6454 continue; 6455 if (dp->d_type != DT_DIR && 6456 dp->d_type != DT_UNKNOWN) { 6457 error = ENOTEMPTY; 6458 break; 6459 } 6460 if (dp->d_namlen > 2) { 6461 error = ENOTEMPTY; 6462 break; 6463 } 6464 if (dp->d_namlen == 1 && 6465 dp->d_name[0] != '.') { 6466 error = ENOTEMPTY; 6467 break; 6468 } 6469 if (dp->d_namlen == 2 && 6470 dp->d_name[1] != '.') { 6471 error = ENOTEMPTY; 6472 break; 6473 } 6474 uio.uio_resid = sizeof(struct dirent); 6475 } 6476 } 6477 free(dirent, M_TEMP); 6478 return (error); 6479 } 6480 6481 int 6482 vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off) 6483 { 6484 int error; 6485 6486 if (dp->d_reclen > ap->a_uio->uio_resid) 6487 return (ENAMETOOLONG); 6488 error = uiomove(dp, dp->d_reclen, ap->a_uio); 6489 if (error) { 6490 if (ap->a_ncookies != NULL) { 6491 if (ap->a_cookies != NULL) 6492 free(ap->a_cookies, M_TEMP); 6493 ap->a_cookies = NULL; 6494 
*ap->a_ncookies = 0; 6495 } 6496 return (error); 6497 } 6498 if (ap->a_ncookies == NULL) 6499 return (0); 6500 6501 KASSERT(ap->a_cookies, 6502 ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!")); 6503 6504 *ap->a_cookies = realloc(*ap->a_cookies, 6505 (*ap->a_ncookies + 1) * sizeof(uint64_t), M_TEMP, M_WAITOK | M_ZERO); 6506 (*ap->a_cookies)[*ap->a_ncookies] = off; 6507 *ap->a_ncookies += 1; 6508 return (0); 6509 } 6510 6511 /* 6512 * The purpose of this routine is to remove granularity from accmode_t, 6513 * reducing it into standard unix access bits - VEXEC, VREAD, VWRITE, 6514 * VADMIN and VAPPEND. 6515 * 6516 * If it returns 0, the caller is supposed to continue with the usual 6517 * access checks using 'accmode' as modified by this routine. If it 6518 * returns nonzero value, the caller is supposed to return that value 6519 * as errno. 6520 * 6521 * Note that after this routine runs, accmode may be zero. 6522 */ 6523 int 6524 vfs_unixify_accmode(accmode_t *accmode) 6525 { 6526 /* 6527 * There is no way to specify explicit "deny" rule using 6528 * file mode or POSIX.1e ACLs. 6529 */ 6530 if (*accmode & VEXPLICIT_DENY) { 6531 *accmode = 0; 6532 return (0); 6533 } 6534 6535 /* 6536 * None of these can be translated into usual access bits. 6537 * Also, the common case for NFSv4 ACLs is to not contain 6538 * either of these bits. Caller should check for VWRITE 6539 * on the containing directory instead. 6540 */ 6541 if (*accmode & (VDELETE_CHILD | VDELETE)) 6542 return (EPERM); 6543 6544 if (*accmode & VADMIN_PERMS) { 6545 *accmode &= ~VADMIN_PERMS; 6546 *accmode |= VADMIN; 6547 } 6548 6549 /* 6550 * There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL 6551 * or VSYNCHRONIZE using file mode or POSIX.1e ACL. 6552 */ 6553 *accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE); 6554 6555 return (0); 6556 } 6557 6558 /* 6559 * Clear out a doomed vnode (if any) and replace it with a new one as long 6560 * as the fs is not being unmounted. Return the root vnode to the caller. 6561 */ 6562 static int __noinline 6563 vfs_cache_root_fallback(struct mount *mp, int flags, struct vnode **vpp) 6564 { 6565 struct vnode *vp; 6566 int error; 6567 6568 restart: 6569 if (mp->mnt_rootvnode != NULL) { 6570 MNT_ILOCK(mp); 6571 vp = mp->mnt_rootvnode; 6572 if (vp != NULL) { 6573 if (!VN_IS_DOOMED(vp)) { 6574 vrefact(vp); 6575 MNT_IUNLOCK(mp); 6576 error = vn_lock(vp, flags); 6577 if (error == 0) { 6578 *vpp = vp; 6579 return (0); 6580 } 6581 vrele(vp); 6582 goto restart; 6583 } 6584 /* 6585 * Clear the old one. 
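 * The cached root vnode is doomed at this point; forget it here so
 * that a fresh root vnode can be installed further down.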
6586 */ 6587 mp->mnt_rootvnode = NULL; 6588 } 6589 MNT_IUNLOCK(mp); 6590 if (vp != NULL) { 6591 vfs_op_barrier_wait(mp); 6592 vrele(vp); 6593 } 6594 } 6595 error = VFS_CACHEDROOT(mp, flags, vpp); 6596 if (error != 0) 6597 return (error); 6598 if (mp->mnt_vfs_ops == 0) { 6599 MNT_ILOCK(mp); 6600 if (mp->mnt_vfs_ops != 0) { 6601 MNT_IUNLOCK(mp); 6602 return (0); 6603 } 6604 if (mp->mnt_rootvnode == NULL) { 6605 vrefact(*vpp); 6606 mp->mnt_rootvnode = *vpp; 6607 } else { 6608 if (mp->mnt_rootvnode != *vpp) { 6609 if (!VN_IS_DOOMED(mp->mnt_rootvnode)) { 6610 panic("%s: mismatch between vnode returned " 6611 " by VFS_CACHEDROOT and the one cached " 6612 " (%p != %p)", 6613 __func__, *vpp, mp->mnt_rootvnode); 6614 } 6615 } 6616 } 6617 MNT_IUNLOCK(mp); 6618 } 6619 return (0); 6620 } 6621 6622 int 6623 vfs_cache_root(struct mount *mp, int flags, struct vnode **vpp) 6624 { 6625 struct mount_pcpu *mpcpu; 6626 struct vnode *vp; 6627 int error; 6628 6629 if (!vfs_op_thread_enter(mp, mpcpu)) 6630 return (vfs_cache_root_fallback(mp, flags, vpp)); 6631 vp = atomic_load_ptr(&mp->mnt_rootvnode); 6632 if (vp == NULL || VN_IS_DOOMED(vp)) { 6633 vfs_op_thread_exit(mp, mpcpu); 6634 return (vfs_cache_root_fallback(mp, flags, vpp)); 6635 } 6636 vrefact(vp); 6637 vfs_op_thread_exit(mp, mpcpu); 6638 error = vn_lock(vp, flags); 6639 if (error != 0) { 6640 vrele(vp); 6641 return (vfs_cache_root_fallback(mp, flags, vpp)); 6642 } 6643 *vpp = vp; 6644 return (0); 6645 } 6646 6647 struct vnode * 6648 vfs_cache_root_clear(struct mount *mp) 6649 { 6650 struct vnode *vp; 6651 6652 /* 6653 * ops > 0 guarantees there is nobody who can see this vnode 6654 */ 6655 MPASS(mp->mnt_vfs_ops > 0); 6656 vp = mp->mnt_rootvnode; 6657 if (vp != NULL) 6658 vn_seqc_write_begin(vp); 6659 mp->mnt_rootvnode = NULL; 6660 return (vp); 6661 } 6662 6663 void 6664 vfs_cache_root_set(struct mount *mp, struct vnode *vp) 6665 { 6666 6667 MPASS(mp->mnt_vfs_ops > 0); 6668 vrefact(vp); 6669 mp->mnt_rootvnode = vp; 6670 } 6671 6672 /* 6673 * These are helper functions for filesystems to traverse all 6674 * their vnodes. See MNT_VNODE_FOREACH_ALL() in sys/mount.h. 6675 * 6676 * This interface replaces MNT_VNODE_FOREACH. 6677 */ 6678 6679 struct vnode * 6680 __mnt_vnode_next_all(struct vnode **mvp, struct mount *mp) 6681 { 6682 struct vnode *vp; 6683 6684 if (should_yield()) 6685 kern_yield(PRI_USER); 6686 MNT_ILOCK(mp); 6687 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); 6688 for (vp = TAILQ_NEXT(*mvp, v_nmntvnodes); vp != NULL; 6689 vp = TAILQ_NEXT(vp, v_nmntvnodes)) { 6690 /* Allow a racy peek at VIRF_DOOMED to save a lock acquisition. */ 6691 if (vp->v_type == VMARKER || VN_IS_DOOMED(vp)) 6692 continue; 6693 VI_LOCK(vp); 6694 if (VN_IS_DOOMED(vp)) { 6695 VI_UNLOCK(vp); 6696 continue; 6697 } 6698 break; 6699 } 6700 if (vp == NULL) { 6701 __mnt_vnode_markerfree_all(mvp, mp); 6702 /* MNT_IUNLOCK(mp); -- done in above function */ 6703 mtx_assert(MNT_MTX(mp), MA_NOTOWNED); 6704 return (NULL); 6705 } 6706 TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); 6707 TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); 6708 MNT_IUNLOCK(mp); 6709 return (vp); 6710 } 6711 6712 struct vnode * 6713 __mnt_vnode_first_all(struct vnode **mvp, struct mount *mp) 6714 { 6715 struct vnode *vp; 6716 6717 *mvp = vn_alloc_marker(mp); 6718 MNT_ILOCK(mp); 6719 MNT_REF(mp); 6720 6721 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 6722 /* Allow a racy peek at VIRF_DOOMED to save a lock acquisition. 
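 * The flag is rechecked under the vnode interlock below before the
 * vnode is returned to the caller.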
*/ 6723 if (vp->v_type == VMARKER || VN_IS_DOOMED(vp)) 6724 continue; 6725 VI_LOCK(vp); 6726 if (VN_IS_DOOMED(vp)) { 6727 VI_UNLOCK(vp); 6728 continue; 6729 } 6730 break; 6731 } 6732 if (vp == NULL) { 6733 MNT_REL(mp); 6734 MNT_IUNLOCK(mp); 6735 vn_free_marker(*mvp); 6736 *mvp = NULL; 6737 return (NULL); 6738 } 6739 TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); 6740 MNT_IUNLOCK(mp); 6741 return (vp); 6742 } 6743 6744 void 6745 __mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp) 6746 { 6747 6748 if (*mvp == NULL) { 6749 MNT_IUNLOCK(mp); 6750 return; 6751 } 6752 6753 mtx_assert(MNT_MTX(mp), MA_OWNED); 6754 6755 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); 6756 TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); 6757 MNT_REL(mp); 6758 MNT_IUNLOCK(mp); 6759 vn_free_marker(*mvp); 6760 *mvp = NULL; 6761 } 6762 6763 /* 6764 * These are helper functions for filesystems to traverse their 6765 * lazy vnodes. See MNT_VNODE_FOREACH_LAZY() in sys/mount.h 6766 */ 6767 static void 6768 mnt_vnode_markerfree_lazy(struct vnode **mvp, struct mount *mp) 6769 { 6770 6771 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); 6772 6773 MNT_ILOCK(mp); 6774 MNT_REL(mp); 6775 MNT_IUNLOCK(mp); 6776 vn_free_marker(*mvp); 6777 *mvp = NULL; 6778 } 6779 6780 /* 6781 * Relock the mp mount vnode list lock with the vp vnode interlock in the 6782 * conventional lock order during mnt_vnode_next_lazy iteration. 6783 * 6784 * On entry, the mount vnode list lock is held and the vnode interlock is not. 6785 * The list lock is dropped and reacquired. On success, both locks are held. 6786 * On failure, the mount vnode list lock is held but the vnode interlock is 6787 * not, and the procedure may have yielded. 6788 */ 6789 static bool 6790 mnt_vnode_next_lazy_relock(struct vnode *mvp, struct mount *mp, 6791 struct vnode *vp) 6792 { 6793 6794 VNASSERT(mvp->v_mount == mp && mvp->v_type == VMARKER && 6795 TAILQ_NEXT(mvp, v_lazylist) != NULL, mvp, 6796 ("%s: bad marker", __func__)); 6797 VNASSERT(vp->v_mount == mp && vp->v_type != VMARKER, vp, 6798 ("%s: inappropriate vnode", __func__)); 6799 ASSERT_VI_UNLOCKED(vp, __func__); 6800 mtx_assert(&mp->mnt_listmtx, MA_OWNED); 6801 6802 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, mvp, v_lazylist); 6803 TAILQ_INSERT_BEFORE(vp, mvp, v_lazylist); 6804 6805 /* 6806 * Note we may be racing against vdrop which transitioned the hold 6807 * count to 0 and now waits for the ->mnt_listmtx lock. This is fine, 6808 * if we are the only user after we get the interlock we will just 6809 * vdrop. 6810 */ 6811 vhold(vp); 6812 mtx_unlock(&mp->mnt_listmtx); 6813 VI_LOCK(vp); 6814 if (VN_IS_DOOMED(vp)) { 6815 VNPASS((vp->v_mflag & VMP_LAZYLIST) == 0, vp); 6816 goto out_lost; 6817 } 6818 VNPASS(vp->v_mflag & VMP_LAZYLIST, vp); 6819 /* 6820 * There is nothing to do if we are the last user. 
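 * In that case refcount_release_if_not_last() fails and the hold
 * acquired above is dropped via vdropl() at the out_lost label.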
6821 */ 6822 if (!refcount_release_if_not_last(&vp->v_holdcnt)) 6823 goto out_lost; 6824 mtx_lock(&mp->mnt_listmtx); 6825 return (true); 6826 out_lost: 6827 vdropl(vp); 6828 maybe_yield(); 6829 mtx_lock(&mp->mnt_listmtx); 6830 return (false); 6831 } 6832 6833 static struct vnode * 6834 mnt_vnode_next_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb, 6835 void *cbarg) 6836 { 6837 struct vnode *vp; 6838 6839 mtx_assert(&mp->mnt_listmtx, MA_OWNED); 6840 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); 6841 restart: 6842 vp = TAILQ_NEXT(*mvp, v_lazylist); 6843 while (vp != NULL) { 6844 if (vp->v_type == VMARKER) { 6845 vp = TAILQ_NEXT(vp, v_lazylist); 6846 continue; 6847 } 6848 /* 6849 * See if we want to process the vnode. Note we may encounter a 6850 * long string of vnodes we don't care about and hog the list 6851 * as a result. Check for it and requeue the marker. 6852 */ 6853 VNPASS(!VN_IS_DOOMED(vp), vp); 6854 if (!cb(vp, cbarg)) { 6855 if (!should_yield()) { 6856 vp = TAILQ_NEXT(vp, v_lazylist); 6857 continue; 6858 } 6859 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, 6860 v_lazylist); 6861 TAILQ_INSERT_AFTER(&mp->mnt_lazyvnodelist, vp, *mvp, 6862 v_lazylist); 6863 mtx_unlock(&mp->mnt_listmtx); 6864 kern_yield(PRI_USER); 6865 mtx_lock(&mp->mnt_listmtx); 6866 goto restart; 6867 } 6868 /* 6869 * Try-lock because this is the wrong lock order. 6870 */ 6871 if (!VI_TRYLOCK(vp) && 6872 !mnt_vnode_next_lazy_relock(*mvp, mp, vp)) 6873 goto restart; 6874 KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp)); 6875 KASSERT(vp->v_mount == mp || vp->v_mount == NULL, 6876 ("alien vnode on the lazy list %p %p", vp, mp)); 6877 VNPASS(vp->v_mount == mp, vp); 6878 VNPASS(!VN_IS_DOOMED(vp), vp); 6879 break; 6880 } 6881 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, v_lazylist); 6882 6883 /* Check if we are done */ 6884 if (vp == NULL) { 6885 mtx_unlock(&mp->mnt_listmtx); 6886 mnt_vnode_markerfree_lazy(mvp, mp); 6887 return (NULL); 6888 } 6889 TAILQ_INSERT_AFTER(&mp->mnt_lazyvnodelist, vp, *mvp, v_lazylist); 6890 mtx_unlock(&mp->mnt_listmtx); 6891 ASSERT_VI_LOCKED(vp, "lazy iter"); 6892 return (vp); 6893 } 6894 6895 struct vnode * 6896 __mnt_vnode_next_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb, 6897 void *cbarg) 6898 { 6899 6900 if (should_yield()) 6901 kern_yield(PRI_USER); 6902 mtx_lock(&mp->mnt_listmtx); 6903 return (mnt_vnode_next_lazy(mvp, mp, cb, cbarg)); 6904 } 6905 6906 struct vnode * 6907 __mnt_vnode_first_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb, 6908 void *cbarg) 6909 { 6910 struct vnode *vp; 6911 6912 if (TAILQ_EMPTY(&mp->mnt_lazyvnodelist)) 6913 return (NULL); 6914 6915 *mvp = vn_alloc_marker(mp); 6916 MNT_ILOCK(mp); 6917 MNT_REF(mp); 6918 MNT_IUNLOCK(mp); 6919 6920 mtx_lock(&mp->mnt_listmtx); 6921 vp = TAILQ_FIRST(&mp->mnt_lazyvnodelist); 6922 if (vp == NULL) { 6923 mtx_unlock(&mp->mnt_listmtx); 6924 mnt_vnode_markerfree_lazy(mvp, mp); 6925 return (NULL); 6926 } 6927 TAILQ_INSERT_BEFORE(vp, *mvp, v_lazylist); 6928 return (mnt_vnode_next_lazy(mvp, mp, cb, cbarg)); 6929 } 6930 6931 void 6932 __mnt_vnode_markerfree_lazy(struct vnode **mvp, struct mount *mp) 6933 { 6934 6935 if (*mvp == NULL) 6936 return; 6937 6938 mtx_lock(&mp->mnt_listmtx); 6939 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, v_lazylist); 6940 mtx_unlock(&mp->mnt_listmtx); 6941 mnt_vnode_markerfree_lazy(mvp, mp); 6942 } 6943 6944 int 6945 vn_dir_check_exec(struct vnode *vp, struct componentname *cnp) 6946 { 6947 6948 if ((cnp->cn_flags & NOEXECCHECK) != 0) { 6949 
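/*
 * The caller asked for the execute check to be skipped this time;
 * consume the one-shot flag and report success without consulting
 * VOP_ACCESS().
 */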
cnp->cn_flags &= ~NOEXECCHECK; 6950 return (0); 6951 } 6952 6953 return (VOP_ACCESS(vp, VEXEC, cnp->cn_cred, curthread)); 6954 } 6955 6956 /* 6957 * Do not use this variant unless you have means other than the hold count 6958 * to prevent the vnode from getting freed. 6959 */ 6960 void 6961 vn_seqc_write_begin_locked(struct vnode *vp) 6962 { 6963 6964 ASSERT_VI_LOCKED(vp, __func__); 6965 VNPASS(vp->v_holdcnt > 0, vp); 6966 VNPASS(vp->v_seqc_users >= 0, vp); 6967 vp->v_seqc_users++; 6968 if (vp->v_seqc_users == 1) 6969 seqc_sleepable_write_begin(&vp->v_seqc); 6970 } 6971 6972 void 6973 vn_seqc_write_begin(struct vnode *vp) 6974 { 6975 6976 VI_LOCK(vp); 6977 vn_seqc_write_begin_locked(vp); 6978 VI_UNLOCK(vp); 6979 } 6980 6981 void 6982 vn_seqc_write_end_locked(struct vnode *vp) 6983 { 6984 6985 ASSERT_VI_LOCKED(vp, __func__); 6986 VNPASS(vp->v_seqc_users > 0, vp); 6987 vp->v_seqc_users--; 6988 if (vp->v_seqc_users == 0) 6989 seqc_sleepable_write_end(&vp->v_seqc); 6990 } 6991 6992 void 6993 vn_seqc_write_end(struct vnode *vp) 6994 { 6995 6996 VI_LOCK(vp); 6997 vn_seqc_write_end_locked(vp); 6998 VI_UNLOCK(vp); 6999 } 7000 7001 /* 7002 * Special case handling for allocating and freeing vnodes. 7003 * 7004 * The counter remains unchanged on free so that a doomed vnode will 7005 * keep testing as in modify as long as it is accessible with SMR. 7006 */ 7007 static void 7008 vn_seqc_init(struct vnode *vp) 7009 { 7010 7011 vp->v_seqc = 0; 7012 vp->v_seqc_users = 0; 7013 } 7014 7015 static void 7016 vn_seqc_write_end_free(struct vnode *vp) 7017 { 7018 7019 VNPASS(seqc_in_modify(vp->v_seqc), vp); 7020 VNPASS(vp->v_seqc_users == 1, vp); 7021 } 7022 7023 void 7024 vn_irflag_set_locked(struct vnode *vp, short toset) 7025 { 7026 short flags; 7027 7028 ASSERT_VI_LOCKED(vp, __func__); 7029 flags = vn_irflag_read(vp); 7030 VNASSERT((flags & toset) == 0, vp, 7031 ("%s: some of the passed flags already set (have %d, passed %d)\n", 7032 __func__, flags, toset)); 7033 atomic_store_short(&vp->v_irflag, flags | toset); 7034 } 7035 7036 void 7037 vn_irflag_set(struct vnode *vp, short toset) 7038 { 7039 7040 VI_LOCK(vp); 7041 vn_irflag_set_locked(vp, toset); 7042 VI_UNLOCK(vp); 7043 } 7044 7045 void 7046 vn_irflag_set_cond_locked(struct vnode *vp, short toset) 7047 { 7048 short flags; 7049 7050 ASSERT_VI_LOCKED(vp, __func__); 7051 flags = vn_irflag_read(vp); 7052 atomic_store_short(&vp->v_irflag, flags | toset); 7053 } 7054 7055 void 7056 vn_irflag_set_cond(struct vnode *vp, short toset) 7057 { 7058 7059 VI_LOCK(vp); 7060 vn_irflag_set_cond_locked(vp, toset); 7061 VI_UNLOCK(vp); 7062 } 7063 7064 void 7065 vn_irflag_unset_locked(struct vnode *vp, short tounset) 7066 { 7067 short flags; 7068 7069 ASSERT_VI_LOCKED(vp, __func__); 7070 flags = vn_irflag_read(vp); 7071 VNASSERT((flags & tounset) == tounset, vp, 7072 ("%s: some of the passed flags not set (have %d, passed %d)\n", 7073 __func__, flags, tounset)); 7074 atomic_store_short(&vp->v_irflag, flags & ~tounset); 7075 } 7076 7077 void 7078 vn_irflag_unset(struct vnode *vp, short tounset) 7079 { 7080 7081 VI_LOCK(vp); 7082 vn_irflag_unset_locked(vp, tounset); 7083 VI_UNLOCK(vp); 7084 } 7085
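/*
 * Example: the vnode traversal helpers above are normally driven through
 * the MNT_VNODE_FOREACH_ALL() macro from sys/mount.h.  A minimal sketch of
 * the usual calling pattern, where mp is the mount point being traversed
 * and the VREG filter and vget()/vput() body are purely illustrative, not
 * part of this file:
 *
 *	struct vnode *vp, *mvp;
 *	int error;
 *
 *	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
 *		if (vp->v_type != VREG) {
 *			VI_UNLOCK(vp);
 *			continue;
 *		}
 *		error = vget(vp, LK_EXCLUSIVE | LK_INTERLOCK);
 *		if (error != 0)
 *			continue;
 *		... work on the locked, referenced vnode ...
 *		vput(vp);
 *	}
 *
 * Each vnode is returned with its interlock held; vget() with LK_INTERLOCK
 * consumes that lock.  A loop that terminates early must clean up the
 * marker with MNT_VNODE_FOREACH_ALL_ABORT().
 */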