1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1989, 1993 5 * The Regents of the University of California. All rights reserved. 6 * (c) UNIX System Laboratories, Inc. 7 * All or some portions of this file are derived from material licensed 8 * to the University of California by American Telephone and Telegraph 9 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 10 * the permission of UNIX System Laboratories, Inc. 11 * 12 * Redistribution and use in source and binary forms, with or without 13 * modification, are permitted provided that the following conditions 14 * are met: 15 * 1. Redistributions of source code must retain the above copyright 16 * notice, this list of conditions and the following disclaimer. 17 * 2. Redistributions in binary form must reproduce the above copyright 18 * notice, this list of conditions and the following disclaimer in the 19 * documentation and/or other materials provided with the distribution. 20 * 3. Neither the name of the University nor the names of its contributors 21 * may be used to endorse or promote products derived from this software 22 * without specific prior written permission. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 34 * SUCH DAMAGE. 
35 * 36 * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 37 */ 38 39 /* 40 * External virtual filesystem routines 41 */ 42 43 #include <sys/cdefs.h> 44 __FBSDID("$FreeBSD$"); 45 46 #include "opt_ddb.h" 47 #include "opt_watchdog.h" 48 49 #include <sys/param.h> 50 #include <sys/systm.h> 51 #include <sys/bio.h> 52 #include <sys/buf.h> 53 #include <sys/capsicum.h> 54 #include <sys/condvar.h> 55 #include <sys/conf.h> 56 #include <sys/counter.h> 57 #include <sys/dirent.h> 58 #include <sys/event.h> 59 #include <sys/eventhandler.h> 60 #include <sys/extattr.h> 61 #include <sys/file.h> 62 #include <sys/fcntl.h> 63 #include <sys/jail.h> 64 #include <sys/kdb.h> 65 #include <sys/kernel.h> 66 #include <sys/kthread.h> 67 #include <sys/ktr.h> 68 #include <sys/lockf.h> 69 #include <sys/malloc.h> 70 #include <sys/mount.h> 71 #include <sys/namei.h> 72 #include <sys/pctrie.h> 73 #include <sys/priv.h> 74 #include <sys/reboot.h> 75 #include <sys/refcount.h> 76 #include <sys/rwlock.h> 77 #include <sys/sched.h> 78 #include <sys/sleepqueue.h> 79 #include <sys/smr.h> 80 #include <sys/smp.h> 81 #include <sys/stat.h> 82 #include <sys/sysctl.h> 83 #include <sys/syslog.h> 84 #include <sys/vmmeter.h> 85 #include <sys/vnode.h> 86 #include <sys/watchdog.h> 87 88 #include <machine/stdarg.h> 89 90 #include <security/mac/mac_framework.h> 91 92 #include <vm/vm.h> 93 #include <vm/vm_object.h> 94 #include <vm/vm_extern.h> 95 #include <vm/pmap.h> 96 #include <vm/vm_map.h> 97 #include <vm/vm_page.h> 98 #include <vm/vm_kern.h> 99 #include <vm/uma.h> 100 101 #ifdef DDB 102 #include <ddb/ddb.h> 103 #endif 104 105 static void delmntque(struct vnode *vp); 106 static int flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, 107 int slpflag, int slptimeo); 108 static void syncer_shutdown(void *arg, int howto); 109 static int vtryrecycle(struct vnode *vp); 110 static void v_init_counters(struct vnode *); 111 static void vn_seqc_init(struct vnode *); 112 static void vn_seqc_write_end_free(struct vnode *vp); 113 static void vgonel(struct vnode *); 114 static bool vhold_recycle_free(struct vnode *); 115 static void vfs_knllock(void *arg); 116 static void vfs_knlunlock(void *arg); 117 static void vfs_knl_assert_lock(void *arg, int what); 118 static void destroy_vpollinfo(struct vpollinfo *vi); 119 static int v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo, 120 daddr_t startlbn, daddr_t endlbn); 121 static void vnlru_recalc(void); 122 123 /* 124 * These fences are intended for cases where some synchronization is 125 * needed between access of v_iflags and lockless vnode refcount (v_holdcnt 126 * and v_usecount) updates. Access to v_iflags is generally synchronized 127 * by the interlock, but we have some internal assertions that check vnode 128 * flags without acquiring the lock. Thus, these fences are INVARIANTS-only 129 * for now. 130 */ 131 #ifdef INVARIANTS 132 #define VNODE_REFCOUNT_FENCE_ACQ() atomic_thread_fence_acq() 133 #define VNODE_REFCOUNT_FENCE_REL() atomic_thread_fence_rel() 134 #else 135 #define VNODE_REFCOUNT_FENCE_ACQ() 136 #define VNODE_REFCOUNT_FENCE_REL() 137 #endif 138 139 /* 140 * Number of vnodes in existence. Increased whenever getnewvnode() 141 * allocates a new vnode, decreased in vdropl() for VIRF_DOOMED vnode. 
142 */ 143 static u_long __exclusive_cache_line numvnodes; 144 145 SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, 146 "Number of vnodes in existence"); 147 148 static counter_u64_t vnodes_created; 149 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, vnodes_created, CTLFLAG_RD, &vnodes_created, 150 "Number of vnodes created by getnewvnode"); 151 152 /* 153 * Conversion tables for conversion from vnode types to inode formats 154 * and back. 155 */ 156 enum vtype iftovt_tab[16] = { 157 VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, 158 VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON 159 }; 160 int vttoif_tab[10] = { 161 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, 162 S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT 163 }; 164 165 /* 166 * List of allocates vnodes in the system. 167 */ 168 static TAILQ_HEAD(freelst, vnode) vnode_list; 169 static struct vnode *vnode_list_free_marker; 170 static struct vnode *vnode_list_reclaim_marker; 171 172 /* 173 * "Free" vnode target. Free vnodes are rarely completely free, but are 174 * just ones that are cheap to recycle. Usually they are for files which 175 * have been stat'd but not read; these usually have inode and namecache 176 * data attached to them. This target is the preferred minimum size of a 177 * sub-cache consisting mostly of such files. The system balances the size 178 * of this sub-cache with its complement to try to prevent either from 179 * thrashing while the other is relatively inactive. The targets express 180 * a preference for the best balance. 181 * 182 * "Above" this target there are 2 further targets (watermarks) related 183 * to recyling of free vnodes. In the best-operating case, the cache is 184 * exactly full, the free list has size between vlowat and vhiwat above the 185 * free target, and recycling from it and normal use maintains this state. 186 * Sometimes the free list is below vlowat or even empty, but this state 187 * is even better for immediate use provided the cache is not full. 188 * Otherwise, vnlru_proc() runs to reclaim enough vnodes (usually non-free 189 * ones) to reach one of these states. The watermarks are currently hard- 190 * coded as 4% and 9% of the available space higher. These and the default 191 * of 25% for wantfreevnodes are too large if the memory size is large. 192 * E.g., 9% of 75% of MAXVNODES is more than 566000 vnodes to reclaim 193 * whenever vnlru_proc() becomes active. 
194 */ 195 static long wantfreevnodes; 196 static long __exclusive_cache_line freevnodes; 197 SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, 198 &freevnodes, 0, "Number of \"free\" vnodes"); 199 static long freevnodes_old; 200 201 static counter_u64_t recycles_count; 202 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, recycles, CTLFLAG_RD, &recycles_count, 203 "Number of vnodes recycled to meet vnode cache targets"); 204 205 static counter_u64_t recycles_free_count; 206 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, recycles_free, CTLFLAG_RD, &recycles_free_count, 207 "Number of free vnodes recycled to meet vnode cache targets"); 208 209 static counter_u64_t deferred_inact; 210 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, deferred_inact, CTLFLAG_RD, &deferred_inact, 211 "Number of times inactive processing was deferred"); 212 213 /* To keep more than one thread at a time from running vfs_getnewfsid */ 214 static struct mtx mntid_mtx; 215 216 /* 217 * Lock for any access to the following: 218 * vnode_list 219 * numvnodes 220 * freevnodes 221 */ 222 static struct mtx __exclusive_cache_line vnode_list_mtx; 223 224 /* Publicly exported FS */ 225 struct nfs_public nfs_pub; 226 227 static uma_zone_t buf_trie_zone; 228 static smr_t buf_trie_smr; 229 230 /* Zone for allocation of new vnodes - used exclusively by getnewvnode() */ 231 static uma_zone_t vnode_zone; 232 MALLOC_DEFINE(M_VNODEPOLL, "VN POLL", "vnode poll"); 233 234 __read_frequently smr_t vfs_smr; 235 236 /* 237 * The workitem queue. 238 * 239 * It is useful to delay writes of file data and filesystem metadata 240 * for tens of seconds so that quickly created and deleted files need 241 * not waste disk bandwidth being created and removed. To realize this, 242 * we append vnodes to a "workitem" queue. When running with a soft 243 * updates implementation, most pending metadata dependencies should 244 * not wait for more than a few seconds. Thus, mounted on block devices 245 * are delayed only about a half the time that file data is delayed. 246 * Similarly, directory updates are more critical, so are only delayed 247 * about a third the time that file data is delayed. Thus, there are 248 * SYNCER_MAXDELAY queues that are processed round-robin at a rate of 249 * one each second (driven off the filesystem syncer process). The 250 * syncer_delayno variable indicates the next queue that is to be processed. 
251 * Items that need to be processed soon are placed in this queue: 252 * 253 * syncer_workitem_pending[syncer_delayno] 254 * 255 * A delay of fifteen seconds is done by placing the request fifteen 256 * entries later in the queue: 257 * 258 * syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask] 259 * 260 */ 261 static int syncer_delayno; 262 static long syncer_mask; 263 LIST_HEAD(synclist, bufobj); 264 static struct synclist *syncer_workitem_pending; 265 /* 266 * The sync_mtx protects: 267 * bo->bo_synclist 268 * sync_vnode_count 269 * syncer_delayno 270 * syncer_state 271 * syncer_workitem_pending 272 * syncer_worklist_len 273 * rushjob 274 */ 275 static struct mtx sync_mtx; 276 static struct cv sync_wakeup; 277 278 #define SYNCER_MAXDELAY 32 279 static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */ 280 static int syncdelay = 30; /* max time to delay syncing data */ 281 static int filedelay = 30; /* time to delay syncing files */ 282 SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, 283 "Time to delay syncing files (in seconds)"); 284 static int dirdelay = 29; /* time to delay syncing directories */ 285 SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, 286 "Time to delay syncing directories (in seconds)"); 287 static int metadelay = 28; /* time to delay syncing metadata */ 288 SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, 289 "Time to delay syncing metadata (in seconds)"); 290 static int rushjob; /* number of slots to run ASAP */ 291 static int stat_rush_requests; /* number of times I/O speeded up */ 292 SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, 293 "Number of times I/O speeded up (rush requests)"); 294 295 #define VDBATCH_SIZE 8 296 struct vdbatch { 297 u_int index; 298 long freevnodes; 299 struct mtx lock; 300 struct vnode *tab[VDBATCH_SIZE]; 301 }; 302 DPCPU_DEFINE_STATIC(struct vdbatch, vd); 303 304 static void vdbatch_dequeue(struct vnode *vp); 305 306 /* 307 * When shutting down the syncer, run it at four times normal speed. 308 */ 309 #define SYNCER_SHUTDOWN_SPEEDUP 4 310 static int sync_vnode_count; 311 static int syncer_worklist_len; 312 static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY } 313 syncer_state; 314 315 /* Target for maximum number of vnodes. */ 316 u_long desiredvnodes; 317 static u_long gapvnodes; /* gap between wanted and desired */ 318 static u_long vhiwat; /* enough extras after expansion */ 319 static u_long vlowat; /* minimal extras before expansion */ 320 static u_long vstir; /* nonzero to stir non-free vnodes */ 321 static volatile int vsmalltrigger = 8; /* pref to keep if > this many pages */ 322 323 static u_long vnlru_read_freevnodes(void); 324 325 /* 326 * Note that no attempt is made to sanitize these parameters. 327 */ 328 static int 329 sysctl_maxvnodes(SYSCTL_HANDLER_ARGS) 330 { 331 u_long val; 332 int error; 333 334 val = desiredvnodes; 335 error = sysctl_handle_long(oidp, &val, 0, req); 336 if (error != 0 || req->newptr == NULL) 337 return (error); 338 339 if (val == desiredvnodes) 340 return (0); 341 mtx_lock(&vnode_list_mtx); 342 desiredvnodes = val; 343 wantfreevnodes = desiredvnodes / 4; 344 vnlru_recalc(); 345 mtx_unlock(&vnode_list_mtx); 346 /* 347 * XXX There is no protection against multiple threads changing 348 * desiredvnodes at the same time. Locking above only helps vnlru and 349 * getnewvnode. 
350 */ 351 vfs_hash_changesize(desiredvnodes); 352 cache_changesize(desiredvnodes); 353 return (0); 354 } 355 356 SYSCTL_PROC(_kern, KERN_MAXVNODES, maxvnodes, 357 CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_maxvnodes, 358 "LU", "Target for maximum number of vnodes"); 359 360 static int 361 sysctl_wantfreevnodes(SYSCTL_HANDLER_ARGS) 362 { 363 u_long val; 364 int error; 365 366 val = wantfreevnodes; 367 error = sysctl_handle_long(oidp, &val, 0, req); 368 if (error != 0 || req->newptr == NULL) 369 return (error); 370 371 if (val == wantfreevnodes) 372 return (0); 373 mtx_lock(&vnode_list_mtx); 374 wantfreevnodes = val; 375 vnlru_recalc(); 376 mtx_unlock(&vnode_list_mtx); 377 return (0); 378 } 379 380 SYSCTL_PROC(_vfs, OID_AUTO, wantfreevnodes, 381 CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_wantfreevnodes, 382 "LU", "Target for minimum number of \"free\" vnodes"); 383 384 SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW, 385 &wantfreevnodes, 0, "Old name for vfs.wantfreevnodes (legacy)"); 386 static int vnlru_nowhere; 387 SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW, 388 &vnlru_nowhere, 0, "Number of times the vnlru process ran without success"); 389 390 static int 391 sysctl_try_reclaim_vnode(SYSCTL_HANDLER_ARGS) 392 { 393 struct vnode *vp; 394 struct nameidata nd; 395 char *buf; 396 unsigned long ndflags; 397 int error; 398 399 if (req->newptr == NULL) 400 return (EINVAL); 401 if (req->newlen >= PATH_MAX) 402 return (E2BIG); 403 404 buf = malloc(PATH_MAX, M_TEMP, M_WAITOK); 405 error = SYSCTL_IN(req, buf, req->newlen); 406 if (error != 0) 407 goto out; 408 409 buf[req->newlen] = '\0'; 410 411 ndflags = LOCKLEAF | NOFOLLOW | AUDITVNODE1 | SAVENAME; 412 NDINIT(&nd, LOOKUP, ndflags, UIO_SYSSPACE, buf, curthread); 413 if ((error = namei(&nd)) != 0) 414 goto out; 415 vp = nd.ni_vp; 416 417 if (VN_IS_DOOMED(vp)) { 418 /* 419 * This vnode is being recycled. Return != 0 to let the caller 420 * know that the sysctl had no effect. Return EAGAIN because a 421 * subsequent call will likely succeed (since namei will create 422 * a new vnode if necessary) 423 */ 424 error = EAGAIN; 425 goto putvnode; 426 } 427 428 counter_u64_add(recycles_count, 1); 429 vgone(vp); 430 putvnode: 431 NDFREE(&nd, 0); 432 out: 433 free(buf, M_TEMP); 434 return (error); 435 } 436 437 static int 438 sysctl_ftry_reclaim_vnode(SYSCTL_HANDLER_ARGS) 439 { 440 struct thread *td = curthread; 441 struct vnode *vp; 442 struct file *fp; 443 int error; 444 int fd; 445 446 if (req->newptr == NULL) 447 return (EBADF); 448 449 error = sysctl_handle_int(oidp, &fd, 0, req); 450 if (error != 0) 451 return (error); 452 error = getvnode(curthread, fd, &cap_fcntl_rights, &fp); 453 if (error != 0) 454 return (error); 455 vp = fp->f_vnode; 456 457 error = vn_lock(vp, LK_EXCLUSIVE); 458 if (error != 0) 459 goto drop; 460 461 counter_u64_add(recycles_count, 1); 462 vgone(vp); 463 VOP_UNLOCK(vp); 464 drop: 465 fdrop(fp, td); 466 return (error); 467 } 468 469 SYSCTL_PROC(_debug, OID_AUTO, try_reclaim_vnode, 470 CTLTYPE_STRING | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0, 471 sysctl_try_reclaim_vnode, "A", "Try to reclaim a vnode by its pathname"); 472 SYSCTL_PROC(_debug, OID_AUTO, ftry_reclaim_vnode, 473 CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0, 474 sysctl_ftry_reclaim_vnode, "I", 475 "Try to reclaim a vnode by its file descriptor"); 476 477 /* Shift count for (uintptr_t)vp to initialize vp->v_hash. */ 478 static int vnsz2log; 479 480 /* 481 * Support for the bufobj clean & dirty pctrie. 
482 */ 483 static void * 484 buf_trie_alloc(struct pctrie *ptree) 485 { 486 return (uma_zalloc_smr(buf_trie_zone, M_NOWAIT)); 487 } 488 489 static void 490 buf_trie_free(struct pctrie *ptree, void *node) 491 { 492 uma_zfree_smr(buf_trie_zone, node); 493 } 494 PCTRIE_DEFINE_SMR(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free, 495 buf_trie_smr); 496 497 /* 498 * Initialize the vnode management data structures. 499 * 500 * Reevaluate the following cap on the number of vnodes after the physical 501 * memory size exceeds 512GB. In the limit, as the physical memory size 502 * grows, the ratio of the memory size in KB to vnodes approaches 64:1. 503 */ 504 #ifndef MAXVNODES_MAX 505 #define MAXVNODES_MAX (512UL * 1024 * 1024 / 64) /* 8M */ 506 #endif 507 508 static MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker"); 509 510 static struct vnode * 511 vn_alloc_marker(struct mount *mp) 512 { 513 struct vnode *vp; 514 515 vp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO); 516 vp->v_type = VMARKER; 517 vp->v_mount = mp; 518 519 return (vp); 520 } 521 522 static void 523 vn_free_marker(struct vnode *vp) 524 { 525 526 MPASS(vp->v_type == VMARKER); 527 free(vp, M_VNODE_MARKER); 528 } 529 530 /* 531 * Initialize a vnode as it first enters the zone. 532 */ 533 static int 534 vnode_init(void *mem, int size, int flags) 535 { 536 struct vnode *vp; 537 538 vp = mem; 539 bzero(vp, size); 540 /* 541 * Setup locks. 542 */ 543 vp->v_vnlock = &vp->v_lock; 544 mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF); 545 /* 546 * By default, don't allow shared locks unless filesystems opt-in. 547 */ 548 lockinit(vp->v_vnlock, PVFS, "vnode", VLKTIMEOUT, 549 LK_NOSHARE | LK_IS_VNODE); 550 /* 551 * Initialize bufobj. 552 */ 553 bufobj_init(&vp->v_bufobj, vp); 554 /* 555 * Initialize namecache. 556 */ 557 cache_vnode_init(vp); 558 /* 559 * Initialize rangelocks. 560 */ 561 rangelock_init(&vp->v_rl); 562 563 vp->v_dbatchcpu = NOCPU; 564 565 /* 566 * Check vhold_recycle_free for an explanation. 567 */ 568 vp->v_holdcnt = VHOLD_NO_SMR; 569 vp->v_type = VNON; 570 mtx_lock(&vnode_list_mtx); 571 TAILQ_INSERT_BEFORE(vnode_list_free_marker, vp, v_vnodelist); 572 mtx_unlock(&vnode_list_mtx); 573 return (0); 574 } 575 576 /* 577 * Free a vnode when it is cleared from the zone. 578 */ 579 static void 580 vnode_fini(void *mem, int size) 581 { 582 struct vnode *vp; 583 struct bufobj *bo; 584 585 vp = mem; 586 vdbatch_dequeue(vp); 587 mtx_lock(&vnode_list_mtx); 588 TAILQ_REMOVE(&vnode_list, vp, v_vnodelist); 589 mtx_unlock(&vnode_list_mtx); 590 rangelock_destroy(&vp->v_rl); 591 lockdestroy(vp->v_vnlock); 592 mtx_destroy(&vp->v_interlock); 593 bo = &vp->v_bufobj; 594 rw_destroy(BO_LOCKPTR(bo)); 595 } 596 597 /* 598 * Provide the size of NFS nclnode and NFS fh for calculation of the 599 * vnode memory consumption. The size is specified directly to 600 * eliminate dependency on NFS-private header. 601 * 602 * Other filesystems may use bigger or smaller (like UFS and ZFS) 603 * private inode data, but the NFS-based estimation is ample enough. 604 * Still, we care about differences in the size between 64- and 32-bit 605 * platforms. 606 * 607 * Namecache structure size is heuristically 608 * sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1. 
609 */ 610 #ifdef _LP64 611 #define NFS_NCLNODE_SZ (528 + 64) 612 #define NC_SZ 148 613 #else 614 #define NFS_NCLNODE_SZ (360 + 32) 615 #define NC_SZ 92 616 #endif 617 618 static void 619 vntblinit(void *dummy __unused) 620 { 621 struct vdbatch *vd; 622 int cpu, physvnodes, virtvnodes; 623 u_int i; 624 625 /* 626 * Desiredvnodes is a function of the physical memory size and the 627 * kernel's heap size. Generally speaking, it scales with the 628 * physical memory size. The ratio of desiredvnodes to the physical 629 * memory size is 1:16 until desiredvnodes exceeds 98,304. 630 * Thereafter, the 631 * marginal ratio of desiredvnodes to the physical memory size is 632 * 1:64. However, desiredvnodes is limited by the kernel's heap 633 * size. The memory required by desiredvnodes vnodes and vm objects 634 * must not exceed 1/10th of the kernel's heap size. 635 */ 636 physvnodes = maxproc + pgtok(vm_cnt.v_page_count) / 64 + 637 3 * min(98304 * 16, pgtok(vm_cnt.v_page_count)) / 64; 638 virtvnodes = vm_kmem_size / (10 * (sizeof(struct vm_object) + 639 sizeof(struct vnode) + NC_SZ * ncsizefactor + NFS_NCLNODE_SZ)); 640 desiredvnodes = min(physvnodes, virtvnodes); 641 if (desiredvnodes > MAXVNODES_MAX) { 642 if (bootverbose) 643 printf("Reducing kern.maxvnodes %lu -> %lu\n", 644 desiredvnodes, MAXVNODES_MAX); 645 desiredvnodes = MAXVNODES_MAX; 646 } 647 wantfreevnodes = desiredvnodes / 4; 648 mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF); 649 TAILQ_INIT(&vnode_list); 650 mtx_init(&vnode_list_mtx, "vnode_list", NULL, MTX_DEF); 651 /* 652 * The lock is taken to appease WITNESS. 653 */ 654 mtx_lock(&vnode_list_mtx); 655 vnlru_recalc(); 656 mtx_unlock(&vnode_list_mtx); 657 vnode_list_free_marker = vn_alloc_marker(NULL); 658 TAILQ_INSERT_HEAD(&vnode_list, vnode_list_free_marker, v_vnodelist); 659 vnode_list_reclaim_marker = vn_alloc_marker(NULL); 660 TAILQ_INSERT_HEAD(&vnode_list, vnode_list_reclaim_marker, v_vnodelist); 661 vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL, 662 vnode_init, vnode_fini, UMA_ALIGN_PTR, 0); 663 uma_zone_set_smr(vnode_zone, vfs_smr); 664 /* 665 * Preallocate enough nodes to support one-per buf so that 666 * we can not fail an insert. reassignbuf() callers can not 667 * tolerate the insertion failure. 668 */ 669 buf_trie_zone = uma_zcreate("BUF TRIE", pctrie_node_size(), 670 NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR, 671 UMA_ZONE_NOFREE | UMA_ZONE_SMR); 672 buf_trie_smr = uma_zone_get_smr(buf_trie_zone); 673 uma_prealloc(buf_trie_zone, nbuf); 674 675 vnodes_created = counter_u64_alloc(M_WAITOK); 676 recycles_count = counter_u64_alloc(M_WAITOK); 677 recycles_free_count = counter_u64_alloc(M_WAITOK); 678 deferred_inact = counter_u64_alloc(M_WAITOK); 679 680 /* 681 * Initialize the filesystem syncer. 682 */ 683 syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE, 684 &syncer_mask); 685 syncer_maxdelay = syncer_mask + 1; 686 mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF); 687 cv_init(&sync_wakeup, "syncer"); 688 for (i = 1; i <= sizeof(struct vnode); i <<= 1) 689 vnsz2log++; 690 vnsz2log--; 691 692 CPU_FOREACH(cpu) { 693 vd = DPCPU_ID_PTR((cpu), vd); 694 bzero(vd, sizeof(*vd)); 695 mtx_init(&vd->lock, "vdbatch", NULL, MTX_DEF); 696 } 697 } 698 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL); 699 700 /* 701 * Mark a mount point as busy. Used to synchronize access and to delay 702 * unmounting. Eventually, mountlist_mtx is not released on failure. 703 * 704 * vfs_busy() is a custom lock, it can block the caller. 
705 * vfs_busy() only sleeps if the unmount is active on the mount point. 706 * For a mountpoint mp, vfs_busy-enforced lock is before lock of any 707 * vnode belonging to mp. 708 * 709 * Lookup uses vfs_busy() to traverse mount points. 710 * root fs var fs 711 * / vnode lock A / vnode lock (/var) D 712 * /var vnode lock B /log vnode lock(/var/log) E 713 * vfs_busy lock C vfs_busy lock F 714 * 715 * Within each file system, the lock order is C->A->B and F->D->E. 716 * 717 * When traversing across mounts, the system follows that lock order: 718 * 719 * C->A->B 720 * | 721 * +->F->D->E 722 * 723 * The lookup() process for namei("/var") illustrates the process: 724 * VOP_LOOKUP() obtains B while A is held 725 * vfs_busy() obtains a shared lock on F while A and B are held 726 * vput() releases lock on B 727 * vput() releases lock on A 728 * VFS_ROOT() obtains lock on D while shared lock on F is held 729 * vfs_unbusy() releases shared lock on F 730 * vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A. 731 * Attempt to lock A (instead of vp_crossmp) while D is held would 732 * violate the global order, causing deadlocks. 733 * 734 * dounmount() locks B while F is drained. 735 */ 736 int 737 vfs_busy(struct mount *mp, int flags) 738 { 739 struct mount_pcpu *mpcpu; 740 741 MPASS((flags & ~MBF_MASK) == 0); 742 CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags); 743 744 if (vfs_op_thread_enter(mp, mpcpu)) { 745 MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0); 746 MPASS((mp->mnt_kern_flag & MNTK_UNMOUNT) == 0); 747 MPASS((mp->mnt_kern_flag & MNTK_REFEXPIRE) == 0); 748 vfs_mp_count_add_pcpu(mpcpu, ref, 1); 749 vfs_mp_count_add_pcpu(mpcpu, lockref, 1); 750 vfs_op_thread_exit(mp, mpcpu); 751 if (flags & MBF_MNTLSTLOCK) 752 mtx_unlock(&mountlist_mtx); 753 return (0); 754 } 755 756 MNT_ILOCK(mp); 757 vfs_assert_mount_counters(mp); 758 MNT_REF(mp); 759 /* 760 * If mount point is currently being unmounted, sleep until the 761 * mount point fate is decided. If thread doing the unmounting fails, 762 * it will clear MNTK_UNMOUNT flag before waking us up, indicating 763 * that this mount point has survived the unmount attempt and vfs_busy 764 * should retry. Otherwise the unmounter thread will set MNTK_REFEXPIRE 765 * flag in addition to MNTK_UNMOUNT, indicating that mount point is 766 * about to be really destroyed. vfs_busy needs to release its 767 * reference on the mount point in this case and return with ENOENT, 768 * telling the caller that mount mount it tried to busy is no longer 769 * valid. 770 */ 771 while (mp->mnt_kern_flag & MNTK_UNMOUNT) { 772 if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) { 773 MNT_REL(mp); 774 MNT_IUNLOCK(mp); 775 CTR1(KTR_VFS, "%s: failed busying before sleeping", 776 __func__); 777 return (ENOENT); 778 } 779 if (flags & MBF_MNTLSTLOCK) 780 mtx_unlock(&mountlist_mtx); 781 mp->mnt_kern_flag |= MNTK_MWAIT; 782 msleep(mp, MNT_MTX(mp), PVFS | PDROP, "vfs_busy", 0); 783 if (flags & MBF_MNTLSTLOCK) 784 mtx_lock(&mountlist_mtx); 785 MNT_ILOCK(mp); 786 } 787 if (flags & MBF_MNTLSTLOCK) 788 mtx_unlock(&mountlist_mtx); 789 mp->mnt_lockref++; 790 MNT_IUNLOCK(mp); 791 return (0); 792 } 793 794 /* 795 * Free a busy filesystem. 
796 */ 797 void 798 vfs_unbusy(struct mount *mp) 799 { 800 struct mount_pcpu *mpcpu; 801 int c; 802 803 CTR2(KTR_VFS, "%s: mp %p", __func__, mp); 804 805 if (vfs_op_thread_enter(mp, mpcpu)) { 806 MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0); 807 vfs_mp_count_sub_pcpu(mpcpu, lockref, 1); 808 vfs_mp_count_sub_pcpu(mpcpu, ref, 1); 809 vfs_op_thread_exit(mp, mpcpu); 810 return; 811 } 812 813 MNT_ILOCK(mp); 814 vfs_assert_mount_counters(mp); 815 MNT_REL(mp); 816 c = --mp->mnt_lockref; 817 if (mp->mnt_vfs_ops == 0) { 818 MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0); 819 MNT_IUNLOCK(mp); 820 return; 821 } 822 if (c < 0) 823 vfs_dump_mount_counters(mp); 824 if (c == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) { 825 MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT); 826 CTR1(KTR_VFS, "%s: waking up waiters", __func__); 827 mp->mnt_kern_flag &= ~MNTK_DRAINING; 828 wakeup(&mp->mnt_lockref); 829 } 830 MNT_IUNLOCK(mp); 831 } 832 833 /* 834 * Lookup a mount point by filesystem identifier. 835 */ 836 struct mount * 837 vfs_getvfs(fsid_t *fsid) 838 { 839 struct mount *mp; 840 841 CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid); 842 mtx_lock(&mountlist_mtx); 843 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 844 if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) { 845 vfs_ref(mp); 846 mtx_unlock(&mountlist_mtx); 847 return (mp); 848 } 849 } 850 mtx_unlock(&mountlist_mtx); 851 CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid); 852 return ((struct mount *) 0); 853 } 854 855 /* 856 * Lookup a mount point by filesystem identifier, busying it before 857 * returning. 858 * 859 * To avoid congestion on mountlist_mtx, implement simple direct-mapped 860 * cache for popular filesystem identifiers. The cache is lockess, using 861 * the fact that struct mount's are never freed. In worst case we may 862 * get pointer to unmounted or even different filesystem, so we have to 863 * check what we got, and go slow way if so. 864 */ 865 struct mount * 866 vfs_busyfs(fsid_t *fsid) 867 { 868 #define FSID_CACHE_SIZE 256 869 typedef struct mount * volatile vmp_t; 870 static vmp_t cache[FSID_CACHE_SIZE]; 871 struct mount *mp; 872 int error; 873 uint32_t hash; 874 875 CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid); 876 hash = fsid->val[0] ^ fsid->val[1]; 877 hash = (hash >> 16 ^ hash) & (FSID_CACHE_SIZE - 1); 878 mp = cache[hash]; 879 if (mp == NULL || fsidcmp(&mp->mnt_stat.f_fsid, fsid) != 0) 880 goto slow; 881 if (vfs_busy(mp, 0) != 0) { 882 cache[hash] = NULL; 883 goto slow; 884 } 885 if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) 886 return (mp); 887 else 888 vfs_unbusy(mp); 889 890 slow: 891 mtx_lock(&mountlist_mtx); 892 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 893 if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) { 894 error = vfs_busy(mp, MBF_MNTLSTLOCK); 895 if (error) { 896 cache[hash] = NULL; 897 mtx_unlock(&mountlist_mtx); 898 return (NULL); 899 } 900 cache[hash] = mp; 901 return (mp); 902 } 903 } 904 CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid); 905 mtx_unlock(&mountlist_mtx); 906 return ((struct mount *) 0); 907 } 908 909 /* 910 * Check if a user can access privileged mount options. 911 */ 912 int 913 vfs_suser(struct mount *mp, struct thread *td) 914 { 915 int error; 916 917 if (jailed(td->td_ucred)) { 918 /* 919 * If the jail of the calling thread lacks permission for 920 * this type of file system, deny immediately. 
921 */ 922 if (!prison_allow(td->td_ucred, mp->mnt_vfc->vfc_prison_flag)) 923 return (EPERM); 924 925 /* 926 * If the file system was mounted outside the jail of the 927 * calling thread, deny immediately. 928 */ 929 if (prison_check(td->td_ucred, mp->mnt_cred) != 0) 930 return (EPERM); 931 } 932 933 /* 934 * If file system supports delegated administration, we don't check 935 * for the PRIV_VFS_MOUNT_OWNER privilege - it will be better verified 936 * by the file system itself. 937 * If this is not the user that did original mount, we check for 938 * the PRIV_VFS_MOUNT_OWNER privilege. 939 */ 940 if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) && 941 mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) { 942 if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0) 943 return (error); 944 } 945 return (0); 946 } 947 948 /* 949 * Get a new unique fsid. Try to make its val[0] unique, since this value 950 * will be used to create fake device numbers for stat(). Also try (but 951 * not so hard) make its val[0] unique mod 2^16, since some emulators only 952 * support 16-bit device numbers. We end up with unique val[0]'s for the 953 * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls. 954 * 955 * Keep in mind that several mounts may be running in parallel. Starting 956 * the search one past where the previous search terminated is both a 957 * micro-optimization and a defense against returning the same fsid to 958 * different mounts. 959 */ 960 void 961 vfs_getnewfsid(struct mount *mp) 962 { 963 static uint16_t mntid_base; 964 struct mount *nmp; 965 fsid_t tfsid; 966 int mtype; 967 968 CTR2(KTR_VFS, "%s: mp %p", __func__, mp); 969 mtx_lock(&mntid_mtx); 970 mtype = mp->mnt_vfc->vfc_typenum; 971 tfsid.val[1] = mtype; 972 mtype = (mtype & 0xFF) << 24; 973 for (;;) { 974 tfsid.val[0] = makedev(255, 975 mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF)); 976 mntid_base++; 977 if ((nmp = vfs_getvfs(&tfsid)) == NULL) 978 break; 979 vfs_rel(nmp); 980 } 981 mp->mnt_stat.f_fsid.val[0] = tfsid.val[0]; 982 mp->mnt_stat.f_fsid.val[1] = tfsid.val[1]; 983 mtx_unlock(&mntid_mtx); 984 } 985 986 /* 987 * Knob to control the precision of file timestamps: 988 * 989 * 0 = seconds only; nanoseconds zeroed. 990 * 1 = seconds and nanoseconds, accurate within 1/HZ. 991 * 2 = seconds and nanoseconds, truncated to microseconds. 992 * >=3 = seconds and nanoseconds, maximum precision. 993 */ 994 enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC }; 995 996 static int timestamp_precision = TSP_USEC; 997 SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW, 998 ×tamp_precision, 0, "File timestamp precision (0: seconds, " 999 "1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to us, " 1000 "3+: sec + ns (max. precision))"); 1001 1002 /* 1003 * Get a current timestamp. 
1004 */ 1005 void 1006 vfs_timestamp(struct timespec *tsp) 1007 { 1008 struct timeval tv; 1009 1010 switch (timestamp_precision) { 1011 case TSP_SEC: 1012 tsp->tv_sec = time_second; 1013 tsp->tv_nsec = 0; 1014 break; 1015 case TSP_HZ: 1016 getnanotime(tsp); 1017 break; 1018 case TSP_USEC: 1019 microtime(&tv); 1020 TIMEVAL_TO_TIMESPEC(&tv, tsp); 1021 break; 1022 case TSP_NSEC: 1023 default: 1024 nanotime(tsp); 1025 break; 1026 } 1027 } 1028 1029 /* 1030 * Set vnode attributes to VNOVAL 1031 */ 1032 void 1033 vattr_null(struct vattr *vap) 1034 { 1035 1036 vap->va_type = VNON; 1037 vap->va_size = VNOVAL; 1038 vap->va_bytes = VNOVAL; 1039 vap->va_mode = VNOVAL; 1040 vap->va_nlink = VNOVAL; 1041 vap->va_uid = VNOVAL; 1042 vap->va_gid = VNOVAL; 1043 vap->va_fsid = VNOVAL; 1044 vap->va_fileid = VNOVAL; 1045 vap->va_blocksize = VNOVAL; 1046 vap->va_rdev = VNOVAL; 1047 vap->va_atime.tv_sec = VNOVAL; 1048 vap->va_atime.tv_nsec = VNOVAL; 1049 vap->va_mtime.tv_sec = VNOVAL; 1050 vap->va_mtime.tv_nsec = VNOVAL; 1051 vap->va_ctime.tv_sec = VNOVAL; 1052 vap->va_ctime.tv_nsec = VNOVAL; 1053 vap->va_birthtime.tv_sec = VNOVAL; 1054 vap->va_birthtime.tv_nsec = VNOVAL; 1055 vap->va_flags = VNOVAL; 1056 vap->va_gen = VNOVAL; 1057 vap->va_vaflags = 0; 1058 } 1059 1060 /* 1061 * Try to reduce the total number of vnodes. 1062 * 1063 * This routine (and its user) are buggy in at least the following ways: 1064 * - all parameters were picked years ago when RAM sizes were significantly 1065 * smaller 1066 * - it can pick vnodes based on pages used by the vm object, but filesystems 1067 * like ZFS don't use it making the pick broken 1068 * - since ZFS has its own aging policy it gets partially combated by this one 1069 * - a dedicated method should be provided for filesystems to let them decide 1070 * whether the vnode should be recycled 1071 * 1072 * This routine is called when we have too many vnodes. It attempts 1073 * to free <count> vnodes and will potentially free vnodes that still 1074 * have VM backing store (VM backing store is typically the cause 1075 * of a vnode blowout so we want to do this). Therefore, this operation 1076 * is not considered cheap. 1077 * 1078 * A number of conditions may prevent a vnode from being reclaimed. 1079 * the buffer cache may have references on the vnode, a directory 1080 * vnode may still have references due to the namei cache representing 1081 * underlying files, or the vnode may be in active use. It is not 1082 * desirable to reuse such vnodes. These conditions may cause the 1083 * number of vnodes to reach some minimum value regardless of what 1084 * you set kern.maxvnodes to. Do not set kern.maxvnodes too low. 1085 * 1086 * @param reclaim_nc_src Only reclaim directories with outgoing namecache 1087 * entries if this argument is strue 1088 * @param trigger Only reclaim vnodes with fewer than this many resident 1089 * pages. 1090 * @param target How many vnodes to reclaim. 1091 * @return The number of vnodes that were reclaimed. 
1092 */ 1093 static int 1094 vlrureclaim(bool reclaim_nc_src, int trigger, u_long target) 1095 { 1096 struct vnode *vp, *mvp; 1097 struct mount *mp; 1098 struct vm_object *object; 1099 u_long done; 1100 bool retried; 1101 1102 mtx_assert(&vnode_list_mtx, MA_OWNED); 1103 1104 retried = false; 1105 done = 0; 1106 1107 mvp = vnode_list_reclaim_marker; 1108 restart: 1109 vp = mvp; 1110 while (done < target) { 1111 vp = TAILQ_NEXT(vp, v_vnodelist); 1112 if (__predict_false(vp == NULL)) 1113 break; 1114 1115 if (__predict_false(vp->v_type == VMARKER)) 1116 continue; 1117 1118 /* 1119 * If it's been deconstructed already, it's still 1120 * referenced, or it exceeds the trigger, skip it. 1121 * Also skip free vnodes. We are trying to make space 1122 * to expand the free list, not reduce it. 1123 */ 1124 if (vp->v_usecount > 0 || vp->v_holdcnt == 0 || 1125 (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src))) 1126 goto next_iter; 1127 1128 if (vp->v_type == VBAD || vp->v_type == VNON) 1129 goto next_iter; 1130 1131 object = atomic_load_ptr(&vp->v_object); 1132 if (object == NULL || object->resident_page_count > trigger) { 1133 goto next_iter; 1134 } 1135 1136 /* 1137 * Handle races against vnode allocation. Filesystems lock the 1138 * vnode some time after it gets returned from getnewvnode, 1139 * despite type and hold count being manipulated earlier. 1140 * Resorting to checking v_mount restores guarantees present 1141 * before the global list was reworked to contain all vnodes. 1142 */ 1143 if (!VI_TRYLOCK(vp)) 1144 goto next_iter; 1145 if (__predict_false(vp->v_type == VBAD || vp->v_type == VNON)) { 1146 VI_UNLOCK(vp); 1147 goto next_iter; 1148 } 1149 if (vp->v_mount == NULL) { 1150 VI_UNLOCK(vp); 1151 goto next_iter; 1152 } 1153 vholdl(vp); 1154 VI_UNLOCK(vp); 1155 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); 1156 TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist); 1157 mtx_unlock(&vnode_list_mtx); 1158 1159 if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { 1160 vdrop(vp); 1161 goto next_iter_unlocked; 1162 } 1163 if (VOP_LOCK(vp, LK_EXCLUSIVE|LK_NOWAIT) != 0) { 1164 vdrop(vp); 1165 vn_finished_write(mp); 1166 goto next_iter_unlocked; 1167 } 1168 1169 VI_LOCK(vp); 1170 if (vp->v_usecount > 0 || 1171 (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) || 1172 (vp->v_object != NULL && 1173 vp->v_object->resident_page_count > trigger)) { 1174 VOP_UNLOCK(vp); 1175 vdropl(vp); 1176 vn_finished_write(mp); 1177 goto next_iter_unlocked; 1178 } 1179 counter_u64_add(recycles_count, 1); 1180 vgonel(vp); 1181 VOP_UNLOCK(vp); 1182 vdropl(vp); 1183 vn_finished_write(mp); 1184 done++; 1185 next_iter_unlocked: 1186 if (should_yield()) 1187 kern_yield(PRI_USER); 1188 mtx_lock(&vnode_list_mtx); 1189 goto restart; 1190 next_iter: 1191 MPASS(vp->v_type != VMARKER); 1192 if (!should_yield()) 1193 continue; 1194 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); 1195 TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist); 1196 mtx_unlock(&vnode_list_mtx); 1197 kern_yield(PRI_USER); 1198 mtx_lock(&vnode_list_mtx); 1199 goto restart; 1200 } 1201 if (done == 0 && !retried) { 1202 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); 1203 TAILQ_INSERT_HEAD(&vnode_list, mvp, v_vnodelist); 1204 retried = true; 1205 goto restart; 1206 } 1207 return (done); 1208 } 1209 1210 static int max_vnlru_free = 10000; /* limit on vnode free requests per call */ 1211 SYSCTL_INT(_debug, OID_AUTO, max_vnlru_free, CTLFLAG_RW, &max_vnlru_free, 1212 0, 1213 "limit on vnode free requests per call to the vnlru_free routine"); 1214 1215 /* 1216 * Attempt to 
reduce the free list by the requested amount. 1217 */ 1218 static int 1219 vnlru_free_locked(int count, struct vfsops *mnt_op) 1220 { 1221 struct vnode *vp, *mvp; 1222 struct mount *mp; 1223 int ocount; 1224 1225 mtx_assert(&vnode_list_mtx, MA_OWNED); 1226 if (count > max_vnlru_free) 1227 count = max_vnlru_free; 1228 ocount = count; 1229 mvp = vnode_list_free_marker; 1230 vp = mvp; 1231 for (;;) { 1232 if (count == 0) { 1233 break; 1234 } 1235 vp = TAILQ_NEXT(vp, v_vnodelist); 1236 if (__predict_false(vp == NULL)) { 1237 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); 1238 TAILQ_INSERT_TAIL(&vnode_list, mvp, v_vnodelist); 1239 break; 1240 } 1241 if (__predict_false(vp->v_type == VMARKER)) 1242 continue; 1243 if (vp->v_holdcnt > 0) 1244 continue; 1245 /* 1246 * Don't recycle if our vnode is from different type 1247 * of mount point. Note that mp is type-safe, the 1248 * check does not reach unmapped address even if 1249 * vnode is reclaimed. 1250 */ 1251 if (mnt_op != NULL && (mp = vp->v_mount) != NULL && 1252 mp->mnt_op != mnt_op) { 1253 continue; 1254 } 1255 if (__predict_false(vp->v_type == VBAD || vp->v_type == VNON)) { 1256 continue; 1257 } 1258 if (!vhold_recycle_free(vp)) 1259 continue; 1260 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); 1261 TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist); 1262 mtx_unlock(&vnode_list_mtx); 1263 if (vtryrecycle(vp) == 0) 1264 count--; 1265 mtx_lock(&vnode_list_mtx); 1266 vp = mvp; 1267 } 1268 return (ocount - count); 1269 } 1270 1271 void 1272 vnlru_free(int count, struct vfsops *mnt_op) 1273 { 1274 1275 mtx_lock(&vnode_list_mtx); 1276 vnlru_free_locked(count, mnt_op); 1277 mtx_unlock(&vnode_list_mtx); 1278 } 1279 1280 static void 1281 vnlru_recalc(void) 1282 { 1283 1284 mtx_assert(&vnode_list_mtx, MA_OWNED); 1285 gapvnodes = imax(desiredvnodes - wantfreevnodes, 100); 1286 vhiwat = gapvnodes / 11; /* 9% -- just under the 10% in vlrureclaim() */ 1287 vlowat = vhiwat / 2; 1288 } 1289 1290 /* 1291 * Attempt to recycle vnodes in a context that is always safe to block. 1292 * Calling vlrurecycle() from the bowels of filesystem code has some 1293 * interesting deadlock problems. 1294 */ 1295 static struct proc *vnlruproc; 1296 static int vnlruproc_sig; 1297 1298 /* 1299 * The main freevnodes counter is only updated when threads requeue their vnode 1300 * batches. CPUs are conditionally walked to compute a more accurate total. 1301 * 1302 * Limit how much of a slop are we willing to tolerate. Note: the actual value 1303 * at any given moment can still exceed slop, but it should not be by significant 1304 * margin in practice. 1305 */ 1306 #define VNLRU_FREEVNODES_SLOP 128 1307 1308 static __inline void 1309 vn_freevnodes_inc(void) 1310 { 1311 struct vdbatch *vd; 1312 1313 critical_enter(); 1314 vd = DPCPU_PTR(vd); 1315 vd->freevnodes++; 1316 critical_exit(); 1317 } 1318 1319 static __inline void 1320 vn_freevnodes_dec(void) 1321 { 1322 struct vdbatch *vd; 1323 1324 critical_enter(); 1325 vd = DPCPU_PTR(vd); 1326 vd->freevnodes--; 1327 critical_exit(); 1328 } 1329 1330 static u_long 1331 vnlru_read_freevnodes(void) 1332 { 1333 struct vdbatch *vd; 1334 long slop; 1335 int cpu; 1336 1337 mtx_assert(&vnode_list_mtx, MA_OWNED); 1338 if (freevnodes > freevnodes_old) 1339 slop = freevnodes - freevnodes_old; 1340 else 1341 slop = freevnodes_old - freevnodes; 1342 if (slop < VNLRU_FREEVNODES_SLOP) 1343 return (freevnodes >= 0 ? 
freevnodes : 0); 1344 freevnodes_old = freevnodes; 1345 CPU_FOREACH(cpu) { 1346 vd = DPCPU_ID_PTR((cpu), vd); 1347 freevnodes_old += vd->freevnodes; 1348 } 1349 return (freevnodes_old >= 0 ? freevnodes_old : 0); 1350 } 1351 1352 static bool 1353 vnlru_under(u_long rnumvnodes, u_long limit) 1354 { 1355 u_long rfreevnodes, space; 1356 1357 if (__predict_false(rnumvnodes > desiredvnodes)) 1358 return (true); 1359 1360 space = desiredvnodes - rnumvnodes; 1361 if (space < limit) { 1362 rfreevnodes = vnlru_read_freevnodes(); 1363 if (rfreevnodes > wantfreevnodes) 1364 space += rfreevnodes - wantfreevnodes; 1365 } 1366 return (space < limit); 1367 } 1368 1369 static bool 1370 vnlru_under_unlocked(u_long rnumvnodes, u_long limit) 1371 { 1372 long rfreevnodes, space; 1373 1374 if (__predict_false(rnumvnodes > desiredvnodes)) 1375 return (true); 1376 1377 space = desiredvnodes - rnumvnodes; 1378 if (space < limit) { 1379 rfreevnodes = atomic_load_long(&freevnodes); 1380 if (rfreevnodes > wantfreevnodes) 1381 space += rfreevnodes - wantfreevnodes; 1382 } 1383 return (space < limit); 1384 } 1385 1386 static void 1387 vnlru_kick(void) 1388 { 1389 1390 mtx_assert(&vnode_list_mtx, MA_OWNED); 1391 if (vnlruproc_sig == 0) { 1392 vnlruproc_sig = 1; 1393 wakeup(vnlruproc); 1394 } 1395 } 1396 1397 static void 1398 vnlru_proc(void) 1399 { 1400 u_long rnumvnodes, rfreevnodes, target; 1401 unsigned long onumvnodes; 1402 int done, force, trigger, usevnodes; 1403 bool reclaim_nc_src, want_reread; 1404 1405 EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, vnlruproc, 1406 SHUTDOWN_PRI_FIRST); 1407 1408 force = 0; 1409 want_reread = false; 1410 for (;;) { 1411 kproc_suspend_check(vnlruproc); 1412 mtx_lock(&vnode_list_mtx); 1413 rnumvnodes = atomic_load_long(&numvnodes); 1414 1415 if (want_reread) { 1416 force = vnlru_under(numvnodes, vhiwat) ? 1 : 0; 1417 want_reread = false; 1418 } 1419 1420 /* 1421 * If numvnodes is too large (due to desiredvnodes being 1422 * adjusted using its sysctl, or emergency growth), first 1423 * try to reduce it by discarding from the free list. 1424 */ 1425 if (rnumvnodes > desiredvnodes) { 1426 vnlru_free_locked(rnumvnodes - desiredvnodes, NULL); 1427 rnumvnodes = atomic_load_long(&numvnodes); 1428 } 1429 /* 1430 * Sleep if the vnode cache is in a good state. This is 1431 * when it is not over-full and has space for about a 4% 1432 * or 9% expansion (by growing its size or inexcessively 1433 * reducing its free list). Otherwise, try to reclaim 1434 * space for a 10% expansion. 1435 */ 1436 if (vstir && force == 0) { 1437 force = 1; 1438 vstir = 0; 1439 } 1440 if (force == 0 && !vnlru_under(rnumvnodes, vlowat)) { 1441 vnlruproc_sig = 0; 1442 wakeup(&vnlruproc_sig); 1443 msleep(vnlruproc, &vnode_list_mtx, 1444 PVFS|PDROP, "vlruwt", hz); 1445 continue; 1446 } 1447 rfreevnodes = vnlru_read_freevnodes(); 1448 1449 onumvnodes = rnumvnodes; 1450 /* 1451 * Calculate parameters for recycling. These are the same 1452 * throughout the loop to give some semblance of fairness. 1453 * The trigger point is to avoid recycling vnodes with lots 1454 * of resident pages. We aren't trying to free memory; we 1455 * are trying to recycle or at least free vnodes. 1456 */ 1457 if (rnumvnodes <= desiredvnodes) 1458 usevnodes = rnumvnodes - rfreevnodes; 1459 else 1460 usevnodes = rnumvnodes; 1461 if (usevnodes <= 0) 1462 usevnodes = 1; 1463 /* 1464 * The trigger value is is chosen to give a conservatively 1465 * large value to ensure that it alone doesn't prevent 1466 * making progress. 
The value can easily be so large that 1467 * it is effectively infinite in some congested and 1468 * misconfigured cases, and this is necessary. Normally 1469 * it is about 8 to 100 (pages), which is quite large. 1470 */ 1471 trigger = vm_cnt.v_page_count * 2 / usevnodes; 1472 if (force < 2) 1473 trigger = vsmalltrigger; 1474 reclaim_nc_src = force >= 3; 1475 target = rnumvnodes * (int64_t)gapvnodes / imax(desiredvnodes, 1); 1476 target = target / 10 + 1; 1477 done = vlrureclaim(reclaim_nc_src, trigger, target); 1478 mtx_unlock(&vnode_list_mtx); 1479 if (onumvnodes > desiredvnodes && numvnodes <= desiredvnodes) 1480 uma_reclaim(UMA_RECLAIM_DRAIN); 1481 if (done == 0) { 1482 if (force == 0 || force == 1) { 1483 force = 2; 1484 continue; 1485 } 1486 if (force == 2) { 1487 force = 3; 1488 continue; 1489 } 1490 want_reread = true; 1491 force = 0; 1492 vnlru_nowhere++; 1493 tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3); 1494 } else { 1495 want_reread = true; 1496 kern_yield(PRI_USER); 1497 } 1498 } 1499 } 1500 1501 static struct kproc_desc vnlru_kp = { 1502 "vnlru", 1503 vnlru_proc, 1504 &vnlruproc 1505 }; 1506 SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, 1507 &vnlru_kp); 1508 1509 /* 1510 * Routines having to do with the management of the vnode table. 1511 */ 1512 1513 /* 1514 * Try to recycle a freed vnode. We abort if anyone picks up a reference 1515 * before we actually vgone(). This function must be called with the vnode 1516 * held to prevent the vnode from being returned to the free list midway 1517 * through vgone(). 1518 */ 1519 static int 1520 vtryrecycle(struct vnode *vp) 1521 { 1522 struct mount *vnmp; 1523 1524 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 1525 VNASSERT(vp->v_holdcnt, vp, 1526 ("vtryrecycle: Recycling vp %p without a reference.", vp)); 1527 /* 1528 * This vnode may found and locked via some other list, if so we 1529 * can't recycle it yet. 1530 */ 1531 if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) { 1532 CTR2(KTR_VFS, 1533 "%s: impossible to recycle, vp %p lock is already held", 1534 __func__, vp); 1535 vdrop(vp); 1536 return (EWOULDBLOCK); 1537 } 1538 /* 1539 * Don't recycle if its filesystem is being suspended. 1540 */ 1541 if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) { 1542 VOP_UNLOCK(vp); 1543 CTR2(KTR_VFS, 1544 "%s: impossible to recycle, cannot start the write for %p", 1545 __func__, vp); 1546 vdrop(vp); 1547 return (EBUSY); 1548 } 1549 /* 1550 * If we got this far, we need to acquire the interlock and see if 1551 * anyone picked up this vnode from another list. If not, we will 1552 * mark it with DOOMED via vgonel() so that anyone who does find it 1553 * will skip over it. 1554 */ 1555 VI_LOCK(vp); 1556 if (vp->v_usecount) { 1557 VOP_UNLOCK(vp); 1558 vdropl(vp); 1559 vn_finished_write(vnmp); 1560 CTR2(KTR_VFS, 1561 "%s: impossible to recycle, %p is already referenced", 1562 __func__, vp); 1563 return (EBUSY); 1564 } 1565 if (!VN_IS_DOOMED(vp)) { 1566 counter_u64_add(recycles_free_count, 1); 1567 vgonel(vp); 1568 } 1569 VOP_UNLOCK(vp); 1570 vdropl(vp); 1571 vn_finished_write(vnmp); 1572 return (0); 1573 } 1574 1575 /* 1576 * Allocate a new vnode. 1577 * 1578 * The operation never returns an error. Returning an error was disabled 1579 * in r145385 (dated 2005) with the following comment: 1580 * 1581 * XXX Not all VFS_VGET/ffs_vget callers check returns. 1582 * 1583 * Given the age of this commit (almost 15 years at the time of writing this 1584 * comment) restoring the ability to fail requires a significant audit of 1585 * all codepaths. 
1586 * 1587 * The routine can try to free a vnode or stall for up to 1 second waiting for 1588 * vnlru to clear things up, but ultimately always performs a M_WAITOK allocation. 1589 */ 1590 static u_long vn_alloc_cyclecount; 1591 1592 static struct vnode * __noinline 1593 vn_alloc_hard(struct mount *mp) 1594 { 1595 u_long rnumvnodes, rfreevnodes; 1596 1597 mtx_lock(&vnode_list_mtx); 1598 rnumvnodes = atomic_load_long(&numvnodes); 1599 if (rnumvnodes + 1 < desiredvnodes) { 1600 vn_alloc_cyclecount = 0; 1601 goto alloc; 1602 } 1603 rfreevnodes = vnlru_read_freevnodes(); 1604 if (vn_alloc_cyclecount++ >= rfreevnodes) { 1605 vn_alloc_cyclecount = 0; 1606 vstir = 1; 1607 } 1608 /* 1609 * Grow the vnode cache if it will not be above its target max 1610 * after growing. Otherwise, if the free list is nonempty, try 1611 * to reclaim 1 item from it before growing the cache (possibly 1612 * above its target max if the reclamation failed or is delayed). 1613 * Otherwise, wait for some space. In all cases, schedule 1614 * vnlru_proc() if we are getting short of space. The watermarks 1615 * should be chosen so that we never wait or even reclaim from 1616 * the free list to below its target minimum. 1617 */ 1618 if (vnlru_free_locked(1, NULL) > 0) 1619 goto alloc; 1620 if (mp == NULL || (mp->mnt_kern_flag & MNTK_SUSPEND) == 0) { 1621 /* 1622 * Wait for space for a new vnode. 1623 */ 1624 vnlru_kick(); 1625 msleep(&vnlruproc_sig, &vnode_list_mtx, PVFS, "vlruwk", hz); 1626 if (atomic_load_long(&numvnodes) + 1 > desiredvnodes && 1627 vnlru_read_freevnodes() > 1) 1628 vnlru_free_locked(1, NULL); 1629 } 1630 alloc: 1631 rnumvnodes = atomic_fetchadd_long(&numvnodes, 1) + 1; 1632 if (vnlru_under(rnumvnodes, vlowat)) 1633 vnlru_kick(); 1634 mtx_unlock(&vnode_list_mtx); 1635 return (uma_zalloc_smr(vnode_zone, M_WAITOK)); 1636 } 1637 1638 static struct vnode * 1639 vn_alloc(struct mount *mp) 1640 { 1641 u_long rnumvnodes; 1642 1643 if (__predict_false(vn_alloc_cyclecount != 0)) 1644 return (vn_alloc_hard(mp)); 1645 rnumvnodes = atomic_fetchadd_long(&numvnodes, 1) + 1; 1646 if (__predict_false(vnlru_under_unlocked(rnumvnodes, vlowat))) { 1647 atomic_subtract_long(&numvnodes, 1); 1648 return (vn_alloc_hard(mp)); 1649 } 1650 1651 return (uma_zalloc_smr(vnode_zone, M_WAITOK)); 1652 } 1653 1654 static void 1655 vn_free(struct vnode *vp) 1656 { 1657 1658 atomic_subtract_long(&numvnodes, 1); 1659 uma_zfree_smr(vnode_zone, vp); 1660 } 1661 1662 /* 1663 * Return the next vnode from the free list. 1664 */ 1665 int 1666 getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops, 1667 struct vnode **vpp) 1668 { 1669 struct vnode *vp; 1670 struct thread *td; 1671 struct lock_object *lo; 1672 1673 CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag); 1674 1675 KASSERT(vops->registered, 1676 ("%s: not registered vector op %p\n", __func__, vops)); 1677 1678 td = curthread; 1679 if (td->td_vp_reserved != NULL) { 1680 vp = td->td_vp_reserved; 1681 td->td_vp_reserved = NULL; 1682 } else { 1683 vp = vn_alloc(mp); 1684 } 1685 counter_u64_add(vnodes_created, 1); 1686 /* 1687 * Locks are given the generic name "vnode" when created. 1688 * Follow the historic practice of using the filesystem 1689 * name when they allocated, e.g., "zfs", "ufs", "nfs, etc. 1690 * 1691 * Locks live in a witness group keyed on their name. Thus, 1692 * when a lock is renamed, it must also move from the witness 1693 * group of its old name to the witness group of its new name. 
1694 * 1695 * The change only needs to be made when the vnode moves 1696 * from one filesystem type to another. We ensure that each 1697 * filesystem use a single static name pointer for its tag so 1698 * that we can compare pointers rather than doing a strcmp(). 1699 */ 1700 lo = &vp->v_vnlock->lock_object; 1701 #ifdef WITNESS 1702 if (lo->lo_name != tag) { 1703 #endif 1704 lo->lo_name = tag; 1705 #ifdef WITNESS 1706 WITNESS_DESTROY(lo); 1707 WITNESS_INIT(lo, tag); 1708 } 1709 #endif 1710 /* 1711 * By default, don't allow shared locks unless filesystems opt-in. 1712 */ 1713 vp->v_vnlock->lock_object.lo_flags |= LK_NOSHARE; 1714 /* 1715 * Finalize various vnode identity bits. 1716 */ 1717 KASSERT(vp->v_object == NULL, ("stale v_object %p", vp)); 1718 KASSERT(vp->v_lockf == NULL, ("stale v_lockf %p", vp)); 1719 KASSERT(vp->v_pollinfo == NULL, ("stale v_pollinfo %p", vp)); 1720 vp->v_type = VNON; 1721 vp->v_op = vops; 1722 vp->v_irflag = 0; 1723 v_init_counters(vp); 1724 vn_seqc_init(vp); 1725 vp->v_bufobj.bo_ops = &buf_ops_bio; 1726 #ifdef DIAGNOSTIC 1727 if (mp == NULL && vops != &dead_vnodeops) 1728 printf("NULL mp in getnewvnode(9), tag %s\n", tag); 1729 #endif 1730 #ifdef MAC 1731 mac_vnode_init(vp); 1732 if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0) 1733 mac_vnode_associate_singlelabel(mp, vp); 1734 #endif 1735 if (mp != NULL) { 1736 vp->v_bufobj.bo_bsize = mp->mnt_stat.f_iosize; 1737 if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0) 1738 vp->v_vflag |= VV_NOKNOTE; 1739 } 1740 1741 /* 1742 * For the filesystems which do not use vfs_hash_insert(), 1743 * still initialize v_hash to have vfs_hash_index() useful. 1744 * E.g., nullfs uses vfs_hash_index() on the lower vnode for 1745 * its own hashing. 1746 */ 1747 vp->v_hash = (uintptr_t)vp >> vnsz2log; 1748 1749 *vpp = vp; 1750 return (0); 1751 } 1752 1753 void 1754 getnewvnode_reserve(void) 1755 { 1756 struct thread *td; 1757 1758 td = curthread; 1759 MPASS(td->td_vp_reserved == NULL); 1760 td->td_vp_reserved = vn_alloc(NULL); 1761 } 1762 1763 void 1764 getnewvnode_drop_reserve(void) 1765 { 1766 struct thread *td; 1767 1768 td = curthread; 1769 if (td->td_vp_reserved != NULL) { 1770 vn_free(td->td_vp_reserved); 1771 td->td_vp_reserved = NULL; 1772 } 1773 } 1774 1775 static void __noinline 1776 freevnode(struct vnode *vp) 1777 { 1778 struct bufobj *bo; 1779 1780 /* 1781 * The vnode has been marked for destruction, so free it. 1782 * 1783 * The vnode will be returned to the zone where it will 1784 * normally remain until it is needed for another vnode. We 1785 * need to cleanup (or verify that the cleanup has already 1786 * been done) any residual data left from its current use 1787 * so as not to contaminate the freshly allocated vnode. 1788 */ 1789 CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp); 1790 /* 1791 * Paired with vgone. 
1792 */ 1793 vn_seqc_write_end_free(vp); 1794 1795 bo = &vp->v_bufobj; 1796 VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't")); 1797 VNPASS(vp->v_holdcnt == VHOLD_NO_SMR, vp); 1798 VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count")); 1799 VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count")); 1800 VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's")); 1801 VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0")); 1802 VNASSERT(pctrie_is_empty(&bo->bo_clean.bv_root), vp, 1803 ("clean blk trie not empty")); 1804 VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0")); 1805 VNASSERT(pctrie_is_empty(&bo->bo_dirty.bv_root), vp, 1806 ("dirty blk trie not empty")); 1807 VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst")); 1808 VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src")); 1809 VNASSERT(vp->v_cache_dd == NULL, vp, ("vp has namecache for ..")); 1810 VNASSERT(TAILQ_EMPTY(&vp->v_rl.rl_waiters), vp, 1811 ("Dangling rangelock waiters")); 1812 VNASSERT((vp->v_iflag & (VI_DOINGINACT | VI_OWEINACT)) == 0, vp, 1813 ("Leaked inactivation")); 1814 VI_UNLOCK(vp); 1815 #ifdef MAC 1816 mac_vnode_destroy(vp); 1817 #endif 1818 if (vp->v_pollinfo != NULL) { 1819 destroy_vpollinfo(vp->v_pollinfo); 1820 vp->v_pollinfo = NULL; 1821 } 1822 vp->v_mountedhere = NULL; 1823 vp->v_unpcb = NULL; 1824 vp->v_rdev = NULL; 1825 vp->v_fifoinfo = NULL; 1826 vp->v_iflag = 0; 1827 vp->v_vflag = 0; 1828 bo->bo_flag = 0; 1829 vn_free(vp); 1830 } 1831 1832 /* 1833 * Delete from old mount point vnode list, if on one. 1834 */ 1835 static void 1836 delmntque(struct vnode *vp) 1837 { 1838 struct mount *mp; 1839 1840 VNPASS((vp->v_mflag & VMP_LAZYLIST) == 0, vp); 1841 1842 mp = vp->v_mount; 1843 if (mp == NULL) 1844 return; 1845 MNT_ILOCK(mp); 1846 VI_LOCK(vp); 1847 vp->v_mount = NULL; 1848 VI_UNLOCK(vp); 1849 VNASSERT(mp->mnt_nvnodelistsize > 0, vp, 1850 ("bad mount point vnode list size")); 1851 TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); 1852 mp->mnt_nvnodelistsize--; 1853 MNT_REL(mp); 1854 MNT_IUNLOCK(mp); 1855 } 1856 1857 static void 1858 insmntque_stddtr(struct vnode *vp, void *dtr_arg) 1859 { 1860 1861 vp->v_data = NULL; 1862 vp->v_op = &dead_vnodeops; 1863 vgone(vp); 1864 vput(vp); 1865 } 1866 1867 /* 1868 * Insert into list of vnodes for the new mount point, if available. 1869 */ 1870 int 1871 insmntque1(struct vnode *vp, struct mount *mp, 1872 void (*dtr)(struct vnode *, void *), void *dtr_arg) 1873 { 1874 1875 KASSERT(vp->v_mount == NULL, 1876 ("insmntque: vnode already on per mount vnode list")); 1877 VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)")); 1878 ASSERT_VOP_ELOCKED(vp, "insmntque: non-locked vp"); 1879 1880 /* 1881 * We acquire the vnode interlock early to ensure that the 1882 * vnode cannot be recycled by another process releasing a 1883 * holdcnt on it before we get it on both the vnode list 1884 * and the active vnode list. The mount mutex protects only 1885 * manipulation of the vnode list and the vnode freelist 1886 * mutex protects only manipulation of the active vnode list. 1887 * Hence the need to hold the vnode interlock throughout. 
1888 */ 1889 MNT_ILOCK(mp); 1890 VI_LOCK(vp); 1891 if (((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0 && 1892 ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 || 1893 mp->mnt_nvnodelistsize == 0)) && 1894 (vp->v_vflag & VV_FORCEINSMQ) == 0) { 1895 VI_UNLOCK(vp); 1896 MNT_IUNLOCK(mp); 1897 if (dtr != NULL) 1898 dtr(vp, dtr_arg); 1899 return (EBUSY); 1900 } 1901 vp->v_mount = mp; 1902 MNT_REF(mp); 1903 TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); 1904 VNASSERT(mp->mnt_nvnodelistsize >= 0, vp, 1905 ("neg mount point vnode list size")); 1906 mp->mnt_nvnodelistsize++; 1907 VI_UNLOCK(vp); 1908 MNT_IUNLOCK(mp); 1909 return (0); 1910 } 1911 1912 int 1913 insmntque(struct vnode *vp, struct mount *mp) 1914 { 1915 1916 return (insmntque1(vp, mp, insmntque_stddtr, NULL)); 1917 } 1918 1919 /* 1920 * Flush out and invalidate all buffers associated with a bufobj 1921 * Called with the underlying object locked. 1922 */ 1923 int 1924 bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo) 1925 { 1926 int error; 1927 1928 BO_LOCK(bo); 1929 if (flags & V_SAVE) { 1930 error = bufobj_wwait(bo, slpflag, slptimeo); 1931 if (error) { 1932 BO_UNLOCK(bo); 1933 return (error); 1934 } 1935 if (bo->bo_dirty.bv_cnt > 0) { 1936 BO_UNLOCK(bo); 1937 do { 1938 error = BO_SYNC(bo, MNT_WAIT); 1939 } while (error == ERELOOKUP); 1940 if (error != 0) 1941 return (error); 1942 /* 1943 * XXX We could save a lock/unlock if this was only 1944 * enabled under INVARIANTS 1945 */ 1946 BO_LOCK(bo); 1947 if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) 1948 panic("vinvalbuf: dirty bufs"); 1949 } 1950 } 1951 /* 1952 * If you alter this loop please notice that interlock is dropped and 1953 * reacquired in flushbuflist. Special care is needed to ensure that 1954 * no race conditions occur from this. 1955 */ 1956 do { 1957 error = flushbuflist(&bo->bo_clean, 1958 flags, bo, slpflag, slptimeo); 1959 if (error == 0 && !(flags & V_CLEANONLY)) 1960 error = flushbuflist(&bo->bo_dirty, 1961 flags, bo, slpflag, slptimeo); 1962 if (error != 0 && error != EAGAIN) { 1963 BO_UNLOCK(bo); 1964 return (error); 1965 } 1966 } while (error != 0); 1967 1968 /* 1969 * Wait for I/O to complete. XXX needs cleaning up. The vnode can 1970 * have write I/O in-progress but if there is a VM object then the 1971 * VM object can also have read-I/O in-progress. 1972 */ 1973 do { 1974 bufobj_wwait(bo, 0, 0); 1975 if ((flags & V_VMIO) == 0 && bo->bo_object != NULL) { 1976 BO_UNLOCK(bo); 1977 vm_object_pip_wait_unlocked(bo->bo_object, "bovlbx"); 1978 BO_LOCK(bo); 1979 } 1980 } while (bo->bo_numoutput > 0); 1981 BO_UNLOCK(bo); 1982 1983 /* 1984 * Destroy the copy in the VM cache, too. 1985 */ 1986 if (bo->bo_object != NULL && 1987 (flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0) { 1988 VM_OBJECT_WLOCK(bo->bo_object); 1989 vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ? 1990 OBJPR_CLEANONLY : 0); 1991 VM_OBJECT_WUNLOCK(bo->bo_object); 1992 } 1993 1994 #ifdef INVARIANTS 1995 BO_LOCK(bo); 1996 if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO | 1997 V_ALLOWCLEAN)) == 0 && (bo->bo_dirty.bv_cnt > 0 || 1998 bo->bo_clean.bv_cnt > 0)) 1999 panic("vinvalbuf: flush failed"); 2000 if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0 && 2001 bo->bo_dirty.bv_cnt > 0) 2002 panic("vinvalbuf: flush dirty failed"); 2003 BO_UNLOCK(bo); 2004 #endif 2005 return (0); 2006 } 2007 2008 /* 2009 * Flush out and invalidate all buffers associated with a vnode. 2010 * Called with the underlying object locked. 
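 *
 * A minimal usage sketch (illustrative only, not a caller taken from
 * this file):
 *
 *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 *	error = vinvalbuf(vp, V_SAVE, 0, 0);
 *	VOP_UNLOCK(vp);
 *
 * V_SAVE writes dirty buffers back before the invalidation; slpflag and
 * slptimeo of 0 ask for an uninterruptible sleep with no timeout.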
2011 */ 2012 int 2013 vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo) 2014 { 2015 2016 CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags); 2017 ASSERT_VOP_LOCKED(vp, "vinvalbuf"); 2018 if (vp->v_object != NULL && vp->v_object->handle != vp) 2019 return (0); 2020 return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo)); 2021 } 2022 2023 /* 2024 * Flush out buffers on the specified list. 2025 * 2026 */ 2027 static int 2028 flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag, 2029 int slptimeo) 2030 { 2031 struct buf *bp, *nbp; 2032 int retval, error; 2033 daddr_t lblkno; 2034 b_xflags_t xflags; 2035 2036 ASSERT_BO_WLOCKED(bo); 2037 2038 retval = 0; 2039 TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) { 2040 /* 2041 * If we are flushing both V_NORMAL and V_ALT buffers then 2042 * do not skip any buffers. If we are flushing only V_NORMAL 2043 * buffers then skip buffers marked as BX_ALTDATA. If we are 2044 * flushing only V_ALT buffers then skip buffers not marked 2045 * as BX_ALTDATA. 2046 */ 2047 if (((flags & (V_NORMAL | V_ALT)) != (V_NORMAL | V_ALT)) && 2048 (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA) != 0) || 2049 ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0))) { 2050 continue; 2051 } 2052 if (nbp != NULL) { 2053 lblkno = nbp->b_lblkno; 2054 xflags = nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN); 2055 } 2056 retval = EAGAIN; 2057 error = BUF_TIMELOCK(bp, 2058 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo), 2059 "flushbuf", slpflag, slptimeo); 2060 if (error) { 2061 BO_LOCK(bo); 2062 return (error != ENOLCK ? error : EAGAIN); 2063 } 2064 KASSERT(bp->b_bufobj == bo, 2065 ("bp %p wrong b_bufobj %p should be %p", 2066 bp, bp->b_bufobj, bo)); 2067 /* 2068 * XXX Since there are no node locks for NFS, I 2069 * believe there is a slight chance that a delayed 2070 * write will occur while sleeping just above, so 2071 * check for it. 2072 */ 2073 if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) && 2074 (flags & V_SAVE)) { 2075 bremfree(bp); 2076 bp->b_flags |= B_ASYNC; 2077 bwrite(bp); 2078 BO_LOCK(bo); 2079 return (EAGAIN); /* XXX: why not loop ? */ 2080 } 2081 bremfree(bp); 2082 bp->b_flags |= (B_INVAL | B_RELBUF); 2083 bp->b_flags &= ~B_ASYNC; 2084 brelse(bp); 2085 BO_LOCK(bo); 2086 if (nbp == NULL) 2087 break; 2088 nbp = gbincore(bo, lblkno); 2089 if (nbp == NULL || (nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) 2090 != xflags) 2091 break; /* nbp invalid */ 2092 } 2093 return (retval); 2094 } 2095 2096 int 2097 bnoreuselist(struct bufv *bufv, struct bufobj *bo, daddr_t startn, daddr_t endn) 2098 { 2099 struct buf *bp; 2100 int error; 2101 daddr_t lblkno; 2102 2103 ASSERT_BO_LOCKED(bo); 2104 2105 for (lblkno = startn;;) { 2106 again: 2107 bp = BUF_PCTRIE_LOOKUP_GE(&bufv->bv_root, lblkno); 2108 if (bp == NULL || bp->b_lblkno >= endn || 2109 bp->b_lblkno < startn) 2110 break; 2111 error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | 2112 LK_INTERLOCK, BO_LOCKPTR(bo), "brlsfl", 0, 0); 2113 if (error != 0) { 2114 BO_RLOCK(bo); 2115 if (error == ENOLCK) 2116 goto again; 2117 return (error); 2118 } 2119 KASSERT(bp->b_bufobj == bo, 2120 ("bp %p wrong b_bufobj %p should be %p", 2121 bp, bp->b_bufobj, bo)); 2122 lblkno = bp->b_lblkno + 1; 2123 if ((bp->b_flags & B_MANAGED) == 0) 2124 bremfree(bp); 2125 bp->b_flags |= B_RELBUF; 2126 /* 2127 * In the VMIO case, use the B_NOREUSE flag to hint that the 2128 * pages backing each buffer in the range are unlikely to be 2129 * reused. 
Dirty buffers will have the hint applied once 2130 * they've been written. 2131 */ 2132 if ((bp->b_flags & B_VMIO) != 0) 2133 bp->b_flags |= B_NOREUSE; 2134 brelse(bp); 2135 BO_RLOCK(bo); 2136 } 2137 return (0); 2138 } 2139 2140 /* 2141 * Truncate a file's buffer and pages to a specified length. This 2142 * is in lieu of the old vinvalbuf mechanism, which performed unneeded 2143 * sync activity. 2144 */ 2145 int 2146 vtruncbuf(struct vnode *vp, off_t length, int blksize) 2147 { 2148 struct buf *bp, *nbp; 2149 struct bufobj *bo; 2150 daddr_t startlbn; 2151 2152 CTR4(KTR_VFS, "%s: vp %p with block %d:%ju", __func__, 2153 vp, blksize, (uintmax_t)length); 2154 2155 /* 2156 * Round up to the *next* lbn. 2157 */ 2158 startlbn = howmany(length, blksize); 2159 2160 ASSERT_VOP_LOCKED(vp, "vtruncbuf"); 2161 2162 bo = &vp->v_bufobj; 2163 restart_unlocked: 2164 BO_LOCK(bo); 2165 2166 while (v_inval_buf_range_locked(vp, bo, startlbn, INT64_MAX) == EAGAIN) 2167 ; 2168 2169 if (length > 0) { 2170 restartsync: 2171 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { 2172 if (bp->b_lblkno > 0) 2173 continue; 2174 /* 2175 * Since we hold the vnode lock this should only 2176 * fail if we're racing with the buf daemon. 2177 */ 2178 if (BUF_LOCK(bp, 2179 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, 2180 BO_LOCKPTR(bo)) == ENOLCK) 2181 goto restart_unlocked; 2182 2183 VNASSERT((bp->b_flags & B_DELWRI), vp, 2184 ("buf(%p) on dirty queue without DELWRI", bp)); 2185 2186 bremfree(bp); 2187 bawrite(bp); 2188 BO_LOCK(bo); 2189 goto restartsync; 2190 } 2191 } 2192 2193 bufobj_wwait(bo, 0, 0); 2194 BO_UNLOCK(bo); 2195 vnode_pager_setsize(vp, length); 2196 2197 return (0); 2198 } 2199 2200 /* 2201 * Invalidate the cached pages of a file's buffer within the range of block 2202 * numbers [startlbn, endlbn). 
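 *
 * For example (sketch only; the caller and its block size field are
 * hypothetical), a filesystem releasing the blocks backing
 * [startlbn, endlbn) of a file might do:
 *
 *	v_inval_buf_range(vp, startlbn, endlbn, fs->fs_bsize);
 *
 * with the vnode locked and blksize equal to the bufobj block size,
 * both of which are asserted below.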
2203 */ 2204 void 2205 v_inval_buf_range(struct vnode *vp, daddr_t startlbn, daddr_t endlbn, 2206 int blksize) 2207 { 2208 struct bufobj *bo; 2209 off_t start, end; 2210 2211 ASSERT_VOP_LOCKED(vp, "v_inval_buf_range"); 2212 2213 start = blksize * startlbn; 2214 end = blksize * endlbn; 2215 2216 bo = &vp->v_bufobj; 2217 BO_LOCK(bo); 2218 MPASS(blksize == bo->bo_bsize); 2219 2220 while (v_inval_buf_range_locked(vp, bo, startlbn, endlbn) == EAGAIN) 2221 ; 2222 2223 BO_UNLOCK(bo); 2224 vn_pages_remove(vp, OFF_TO_IDX(start), OFF_TO_IDX(end + PAGE_SIZE - 1)); 2225 } 2226 2227 static int 2228 v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo, 2229 daddr_t startlbn, daddr_t endlbn) 2230 { 2231 struct buf *bp, *nbp; 2232 bool anyfreed; 2233 2234 ASSERT_VOP_LOCKED(vp, "v_inval_buf_range_locked"); 2235 ASSERT_BO_LOCKED(bo); 2236 2237 do { 2238 anyfreed = false; 2239 TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) { 2240 if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn) 2241 continue; 2242 if (BUF_LOCK(bp, 2243 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, 2244 BO_LOCKPTR(bo)) == ENOLCK) { 2245 BO_LOCK(bo); 2246 return (EAGAIN); 2247 } 2248 2249 bremfree(bp); 2250 bp->b_flags |= B_INVAL | B_RELBUF; 2251 bp->b_flags &= ~B_ASYNC; 2252 brelse(bp); 2253 anyfreed = true; 2254 2255 BO_LOCK(bo); 2256 if (nbp != NULL && 2257 (((nbp->b_xflags & BX_VNCLEAN) == 0) || 2258 nbp->b_vp != vp || 2259 (nbp->b_flags & B_DELWRI) != 0)) 2260 return (EAGAIN); 2261 } 2262 2263 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { 2264 if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn) 2265 continue; 2266 if (BUF_LOCK(bp, 2267 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, 2268 BO_LOCKPTR(bo)) == ENOLCK) { 2269 BO_LOCK(bo); 2270 return (EAGAIN); 2271 } 2272 bremfree(bp); 2273 bp->b_flags |= B_INVAL | B_RELBUF; 2274 bp->b_flags &= ~B_ASYNC; 2275 brelse(bp); 2276 anyfreed = true; 2277 2278 BO_LOCK(bo); 2279 if (nbp != NULL && 2280 (((nbp->b_xflags & BX_VNDIRTY) == 0) || 2281 (nbp->b_vp != vp) || 2282 (nbp->b_flags & B_DELWRI) == 0)) 2283 return (EAGAIN); 2284 } 2285 } while (anyfreed); 2286 return (0); 2287 } 2288 2289 static void 2290 buf_vlist_remove(struct buf *bp) 2291 { 2292 struct bufv *bv; 2293 b_xflags_t flags; 2294 2295 flags = bp->b_xflags; 2296 2297 KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); 2298 ASSERT_BO_WLOCKED(bp->b_bufobj); 2299 KASSERT((flags & (BX_VNDIRTY | BX_VNCLEAN)) != 0 && 2300 (flags & (BX_VNDIRTY | BX_VNCLEAN)) != (BX_VNDIRTY | BX_VNCLEAN), 2301 ("%s: buffer %p has invalid queue state", __func__, bp)); 2302 2303 if ((flags & BX_VNDIRTY) != 0) 2304 bv = &bp->b_bufobj->bo_dirty; 2305 else 2306 bv = &bp->b_bufobj->bo_clean; 2307 BUF_PCTRIE_REMOVE(&bv->bv_root, bp->b_lblkno); 2308 TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs); 2309 bv->bv_cnt--; 2310 bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN); 2311 } 2312 2313 /* 2314 * Add the buffer to the sorted clean or dirty block list. 2315 * 2316 * NOTE: xflags is passed as a constant, optimizing this inline function! 
2317 */ 2318 static void 2319 buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags) 2320 { 2321 struct bufv *bv; 2322 struct buf *n; 2323 int error; 2324 2325 ASSERT_BO_WLOCKED(bo); 2326 KASSERT((bo->bo_flag & BO_NOBUFS) == 0, 2327 ("buf_vlist_add: bo %p does not allow bufs", bo)); 2328 KASSERT((xflags & BX_VNDIRTY) == 0 || (bo->bo_flag & BO_DEAD) == 0, 2329 ("dead bo %p", bo)); 2330 KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, 2331 ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags)); 2332 bp->b_xflags |= xflags; 2333 if (xflags & BX_VNDIRTY) 2334 bv = &bo->bo_dirty; 2335 else 2336 bv = &bo->bo_clean; 2337 2338 /* 2339 * Keep the list ordered. Optimize empty list insertion. Assume 2340 * we tend to grow at the tail so lookup_le should usually be cheaper 2341 * than _ge. 2342 */ 2343 if (bv->bv_cnt == 0 || 2344 bp->b_lblkno > TAILQ_LAST(&bv->bv_hd, buflists)->b_lblkno) 2345 TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs); 2346 else if ((n = BUF_PCTRIE_LOOKUP_LE(&bv->bv_root, bp->b_lblkno)) == NULL) 2347 TAILQ_INSERT_HEAD(&bv->bv_hd, bp, b_bobufs); 2348 else 2349 TAILQ_INSERT_AFTER(&bv->bv_hd, n, bp, b_bobufs); 2350 error = BUF_PCTRIE_INSERT(&bv->bv_root, bp); 2351 if (error) 2352 panic("buf_vlist_add: Preallocated nodes insufficient."); 2353 bv->bv_cnt++; 2354 } 2355 2356 /* 2357 * Look up a buffer using the buffer tries. 2358 */ 2359 struct buf * 2360 gbincore(struct bufobj *bo, daddr_t lblkno) 2361 { 2362 struct buf *bp; 2363 2364 ASSERT_BO_LOCKED(bo); 2365 bp = BUF_PCTRIE_LOOKUP(&bo->bo_clean.bv_root, lblkno); 2366 if (bp != NULL) 2367 return (bp); 2368 return (BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, lblkno)); 2369 } 2370 2371 /* 2372 * Look up a buf using the buffer tries, without the bufobj lock. This relies 2373 * on SMR for safe lookup, and bufs being in a no-free zone to provide type 2374 * stability of the result. Like other lockless lookups, the found buf may 2375 * already be invalid by the time this function returns. 2376 */ 2377 struct buf * 2378 gbincore_unlocked(struct bufobj *bo, daddr_t lblkno) 2379 { 2380 struct buf *bp; 2381 2382 ASSERT_BO_UNLOCKED(bo); 2383 bp = BUF_PCTRIE_LOOKUP_UNLOCKED(&bo->bo_clean.bv_root, lblkno); 2384 if (bp != NULL) 2385 return (bp); 2386 return (BUF_PCTRIE_LOOKUP_UNLOCKED(&bo->bo_dirty.bv_root, lblkno)); 2387 } 2388 2389 /* 2390 * Associate a buffer with a vnode. 2391 */ 2392 void 2393 bgetvp(struct vnode *vp, struct buf *bp) 2394 { 2395 struct bufobj *bo; 2396 2397 bo = &vp->v_bufobj; 2398 ASSERT_BO_WLOCKED(bo); 2399 VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free")); 2400 2401 CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags); 2402 VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp, 2403 ("bgetvp: bp already attached! %p", bp)); 2404 2405 vhold(vp); 2406 bp->b_vp = vp; 2407 bp->b_bufobj = bo; 2408 /* 2409 * Insert onto list for new vnode. 2410 */ 2411 buf_vlist_add(bp, bo, BX_VNCLEAN); 2412 } 2413 2414 /* 2415 * Disassociate a buffer from a vnode. 2416 */ 2417 void 2418 brelvp(struct buf *bp) 2419 { 2420 struct bufobj *bo; 2421 struct vnode *vp; 2422 2423 CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); 2424 KASSERT(bp->b_vp != NULL, ("brelvp: NULL")); 2425 2426 /* 2427 * Delete from old vnode list, if on one. 
2428 */ 2429 vp = bp->b_vp; /* XXX */ 2430 bo = bp->b_bufobj; 2431 BO_LOCK(bo); 2432 buf_vlist_remove(bp); 2433 if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { 2434 bo->bo_flag &= ~BO_ONWORKLST; 2435 mtx_lock(&sync_mtx); 2436 LIST_REMOVE(bo, bo_synclist); 2437 syncer_worklist_len--; 2438 mtx_unlock(&sync_mtx); 2439 } 2440 bp->b_vp = NULL; 2441 bp->b_bufobj = NULL; 2442 BO_UNLOCK(bo); 2443 vdrop(vp); 2444 } 2445 2446 /* 2447 * Add an item to the syncer work queue. 2448 */ 2449 static void 2450 vn_syncer_add_to_worklist(struct bufobj *bo, int delay) 2451 { 2452 int slot; 2453 2454 ASSERT_BO_WLOCKED(bo); 2455 2456 mtx_lock(&sync_mtx); 2457 if (bo->bo_flag & BO_ONWORKLST) 2458 LIST_REMOVE(bo, bo_synclist); 2459 else { 2460 bo->bo_flag |= BO_ONWORKLST; 2461 syncer_worklist_len++; 2462 } 2463 2464 if (delay > syncer_maxdelay - 2) 2465 delay = syncer_maxdelay - 2; 2466 slot = (syncer_delayno + delay) & syncer_mask; 2467 2468 LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist); 2469 mtx_unlock(&sync_mtx); 2470 } 2471 2472 static int 2473 sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS) 2474 { 2475 int error, len; 2476 2477 mtx_lock(&sync_mtx); 2478 len = syncer_worklist_len - sync_vnode_count; 2479 mtx_unlock(&sync_mtx); 2480 error = SYSCTL_OUT(req, &len, sizeof(len)); 2481 return (error); 2482 } 2483 2484 SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, 2485 CTLTYPE_INT | CTLFLAG_MPSAFE| CTLFLAG_RD, NULL, 0, 2486 sysctl_vfs_worklist_len, "I", "Syncer thread worklist length"); 2487 2488 static struct proc *updateproc; 2489 static void sched_sync(void); 2490 static struct kproc_desc up_kp = { 2491 "syncer", 2492 sched_sync, 2493 &updateproc 2494 }; 2495 SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp); 2496 2497 static int 2498 sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td) 2499 { 2500 struct vnode *vp; 2501 struct mount *mp; 2502 2503 *bo = LIST_FIRST(slp); 2504 if (*bo == NULL) 2505 return (0); 2506 vp = bo2vnode(*bo); 2507 if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0) 2508 return (1); 2509 /* 2510 * We use vhold in case the vnode does not 2511 * successfully sync. vhold prevents the vnode from 2512 * going away when we unlock the sync_mtx so that 2513 * we can acquire the vnode interlock. 2514 */ 2515 vholdl(vp); 2516 mtx_unlock(&sync_mtx); 2517 VI_UNLOCK(vp); 2518 if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { 2519 vdrop(vp); 2520 mtx_lock(&sync_mtx); 2521 return (*bo == LIST_FIRST(slp)); 2522 } 2523 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 2524 (void) VOP_FSYNC(vp, MNT_LAZY, td); 2525 VOP_UNLOCK(vp); 2526 vn_finished_write(mp); 2527 BO_LOCK(*bo); 2528 if (((*bo)->bo_flag & BO_ONWORKLST) != 0) { 2529 /* 2530 * Put us back on the worklist. The worklist 2531 * routine will remove us from our current 2532 * position and then add us back in at a later 2533 * position. 2534 */ 2535 vn_syncer_add_to_worklist(*bo, syncdelay); 2536 } 2537 BO_UNLOCK(*bo); 2538 vdrop(vp); 2539 mtx_lock(&sync_mtx); 2540 return (0); 2541 } 2542 2543 static int first_printf = 1; 2544 2545 /* 2546 * System filesystem synchronizer daemon. 
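 *
 * The syncer runs a wheel of worklist slots.  vn_syncer_add_to_worklist()
 * above files a bufobj roughly "delay" seconds ahead of the current hand:
 *
 *	slot = (syncer_delayno + delay) & syncer_mask;
 *	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist);
 *
 * Once a second the loop below advances syncer_delayno by one slot and
 * flushes every vnode found there; a bufobj that is still on the
 * worklist afterwards gets reinserted at a later slot.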
2547 */ 2548 static void 2549 sched_sync(void) 2550 { 2551 struct synclist *next, *slp; 2552 struct bufobj *bo; 2553 long starttime; 2554 struct thread *td = curthread; 2555 int last_work_seen; 2556 int net_worklist_len; 2557 int syncer_final_iter; 2558 int error; 2559 2560 last_work_seen = 0; 2561 syncer_final_iter = 0; 2562 syncer_state = SYNCER_RUNNING; 2563 starttime = time_uptime; 2564 td->td_pflags |= TDP_NORUNNINGBUF; 2565 2566 EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc, 2567 SHUTDOWN_PRI_LAST); 2568 2569 mtx_lock(&sync_mtx); 2570 for (;;) { 2571 if (syncer_state == SYNCER_FINAL_DELAY && 2572 syncer_final_iter == 0) { 2573 mtx_unlock(&sync_mtx); 2574 kproc_suspend_check(td->td_proc); 2575 mtx_lock(&sync_mtx); 2576 } 2577 net_worklist_len = syncer_worklist_len - sync_vnode_count; 2578 if (syncer_state != SYNCER_RUNNING && 2579 starttime != time_uptime) { 2580 if (first_printf) { 2581 printf("\nSyncing disks, vnodes remaining... "); 2582 first_printf = 0; 2583 } 2584 printf("%d ", net_worklist_len); 2585 } 2586 starttime = time_uptime; 2587 2588 /* 2589 * Push files whose dirty time has expired. Be careful 2590 * of interrupt race on slp queue. 2591 * 2592 * Skip over empty worklist slots when shutting down. 2593 */ 2594 do { 2595 slp = &syncer_workitem_pending[syncer_delayno]; 2596 syncer_delayno += 1; 2597 if (syncer_delayno == syncer_maxdelay) 2598 syncer_delayno = 0; 2599 next = &syncer_workitem_pending[syncer_delayno]; 2600 /* 2601 * If the worklist has wrapped since the 2602 * it was emptied of all but syncer vnodes, 2603 * switch to the FINAL_DELAY state and run 2604 * for one more second. 2605 */ 2606 if (syncer_state == SYNCER_SHUTTING_DOWN && 2607 net_worklist_len == 0 && 2608 last_work_seen == syncer_delayno) { 2609 syncer_state = SYNCER_FINAL_DELAY; 2610 syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP; 2611 } 2612 } while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) && 2613 syncer_worklist_len > 0); 2614 2615 /* 2616 * Keep track of the last time there was anything 2617 * on the worklist other than syncer vnodes. 2618 * Return to the SHUTTING_DOWN state if any 2619 * new work appears. 2620 */ 2621 if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING) 2622 last_work_seen = syncer_delayno; 2623 if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY) 2624 syncer_state = SYNCER_SHUTTING_DOWN; 2625 while (!LIST_EMPTY(slp)) { 2626 error = sync_vnode(slp, &bo, td); 2627 if (error == 1) { 2628 LIST_REMOVE(bo, bo_synclist); 2629 LIST_INSERT_HEAD(next, bo, bo_synclist); 2630 continue; 2631 } 2632 2633 if (first_printf == 0) { 2634 /* 2635 * Drop the sync mutex, because some watchdog 2636 * drivers need to sleep while patting 2637 */ 2638 mtx_unlock(&sync_mtx); 2639 wdog_kern_pat(WD_LASTVAL); 2640 mtx_lock(&sync_mtx); 2641 } 2642 } 2643 if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0) 2644 syncer_final_iter--; 2645 /* 2646 * The variable rushjob allows the kernel to speed up the 2647 * processing of the filesystem syncer process. A rushjob 2648 * value of N tells the filesystem syncer to process the next 2649 * N seconds worth of work on its queue ASAP. Currently rushjob 2650 * is used by the soft update code to speed up the filesystem 2651 * syncer process when the incore state is getting so far 2652 * ahead of the disk that the kernel memory pool is being 2653 * threatened with exhaustion. 
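 *
 * A subsystem wanting the syncer to hurry calls speedup_syncer()
 * below, e.g. (sketch):
 *
 *	(void)speedup_syncer();
 *
 * Each successful call adds one more second worth of work, bounded by
 * syncdelay / 2, and wakes the daemon through sync_wakeup.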
2654 */ 2655 if (rushjob > 0) { 2656 rushjob -= 1; 2657 continue; 2658 } 2659 /* 2660 * Just sleep for a short period of time between 2661 * iterations when shutting down to allow some I/O 2662 * to happen. 2663 * 2664 * If it has taken us less than a second to process the 2665 * current work, then wait. Otherwise start right over 2666 * again. We can still lose time if any single round 2667 * takes more than two seconds, but it does not really 2668 * matter as we are just trying to generally pace the 2669 * filesystem activity. 2670 */ 2671 if (syncer_state != SYNCER_RUNNING || 2672 time_uptime == starttime) { 2673 thread_lock(td); 2674 sched_prio(td, PPAUSE); 2675 thread_unlock(td); 2676 } 2677 if (syncer_state != SYNCER_RUNNING) 2678 cv_timedwait(&sync_wakeup, &sync_mtx, 2679 hz / SYNCER_SHUTDOWN_SPEEDUP); 2680 else if (time_uptime == starttime) 2681 cv_timedwait(&sync_wakeup, &sync_mtx, hz); 2682 } 2683 } 2684 2685 /* 2686 * Request the syncer daemon to speed up its work. 2687 * We never push it to speed up more than half of its 2688 * normal turn time, otherwise it could take over the cpu. 2689 */ 2690 int 2691 speedup_syncer(void) 2692 { 2693 int ret = 0; 2694 2695 mtx_lock(&sync_mtx); 2696 if (rushjob < syncdelay / 2) { 2697 rushjob += 1; 2698 stat_rush_requests += 1; 2699 ret = 1; 2700 } 2701 mtx_unlock(&sync_mtx); 2702 cv_broadcast(&sync_wakeup); 2703 return (ret); 2704 } 2705 2706 /* 2707 * Tell the syncer to speed up its work and run though its work 2708 * list several times, then tell it to shut down. 2709 */ 2710 static void 2711 syncer_shutdown(void *arg, int howto) 2712 { 2713 2714 if (howto & RB_NOSYNC) 2715 return; 2716 mtx_lock(&sync_mtx); 2717 syncer_state = SYNCER_SHUTTING_DOWN; 2718 rushjob = 0; 2719 mtx_unlock(&sync_mtx); 2720 cv_broadcast(&sync_wakeup); 2721 kproc_shutdown(arg, howto); 2722 } 2723 2724 void 2725 syncer_suspend(void) 2726 { 2727 2728 syncer_shutdown(updateproc, 0); 2729 } 2730 2731 void 2732 syncer_resume(void) 2733 { 2734 2735 mtx_lock(&sync_mtx); 2736 first_printf = 1; 2737 syncer_state = SYNCER_RUNNING; 2738 mtx_unlock(&sync_mtx); 2739 cv_broadcast(&sync_wakeup); 2740 kproc_resume(updateproc); 2741 } 2742 2743 /* 2744 * Move the buffer between the clean and dirty lists of its vnode. 2745 */ 2746 void 2747 reassignbuf(struct buf *bp) 2748 { 2749 struct vnode *vp; 2750 struct bufobj *bo; 2751 int delay; 2752 #ifdef INVARIANTS 2753 struct bufv *bv; 2754 #endif 2755 2756 vp = bp->b_vp; 2757 bo = bp->b_bufobj; 2758 2759 KASSERT((bp->b_flags & B_PAGING) == 0, 2760 ("%s: cannot reassign paging buffer %p", __func__, bp)); 2761 2762 CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X", 2763 bp, bp->b_vp, bp->b_flags); 2764 2765 BO_LOCK(bo); 2766 buf_vlist_remove(bp); 2767 2768 /* 2769 * If dirty, put on list of dirty buffers; otherwise insert onto list 2770 * of clean buffers. 
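 *
 * A buffer going dirty also puts the bufobj on the syncer worklist,
 * with the delay picked by vnode type (dirdelay, metadelay or
 * filedelay); conversely, a bufobj whose last dirty buffer moves to
 * the clean list is taken back off the worklist.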
2771 */ 2772 if (bp->b_flags & B_DELWRI) { 2773 if ((bo->bo_flag & BO_ONWORKLST) == 0) { 2774 switch (vp->v_type) { 2775 case VDIR: 2776 delay = dirdelay; 2777 break; 2778 case VCHR: 2779 delay = metadelay; 2780 break; 2781 default: 2782 delay = filedelay; 2783 } 2784 vn_syncer_add_to_worklist(bo, delay); 2785 } 2786 buf_vlist_add(bp, bo, BX_VNDIRTY); 2787 } else { 2788 buf_vlist_add(bp, bo, BX_VNCLEAN); 2789 2790 if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { 2791 mtx_lock(&sync_mtx); 2792 LIST_REMOVE(bo, bo_synclist); 2793 syncer_worklist_len--; 2794 mtx_unlock(&sync_mtx); 2795 bo->bo_flag &= ~BO_ONWORKLST; 2796 } 2797 } 2798 #ifdef INVARIANTS 2799 bv = &bo->bo_clean; 2800 bp = TAILQ_FIRST(&bv->bv_hd); 2801 KASSERT(bp == NULL || bp->b_bufobj == bo, 2802 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 2803 bp = TAILQ_LAST(&bv->bv_hd, buflists); 2804 KASSERT(bp == NULL || bp->b_bufobj == bo, 2805 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 2806 bv = &bo->bo_dirty; 2807 bp = TAILQ_FIRST(&bv->bv_hd); 2808 KASSERT(bp == NULL || bp->b_bufobj == bo, 2809 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 2810 bp = TAILQ_LAST(&bv->bv_hd, buflists); 2811 KASSERT(bp == NULL || bp->b_bufobj == bo, 2812 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 2813 #endif 2814 BO_UNLOCK(bo); 2815 } 2816 2817 static void 2818 v_init_counters(struct vnode *vp) 2819 { 2820 2821 VNASSERT(vp->v_type == VNON && vp->v_data == NULL && vp->v_iflag == 0, 2822 vp, ("%s called for an initialized vnode", __FUNCTION__)); 2823 ASSERT_VI_UNLOCKED(vp, __FUNCTION__); 2824 2825 refcount_init(&vp->v_holdcnt, 1); 2826 refcount_init(&vp->v_usecount, 1); 2827 } 2828 2829 /* 2830 * Grab a particular vnode from the free list, increment its 2831 * reference count and lock it. VIRF_DOOMED is set if the vnode 2832 * is being destroyed. Only callers who specify LK_RETRY will 2833 * see doomed vnodes. If inactive processing was delayed in 2834 * vput try to do it here. 2835 * 2836 * usecount is manipulated using atomics without holding any locks. 2837 * 2838 * holdcnt can be manipulated using atomics without holding any locks, 2839 * except when transitioning 1<->0, in which case the interlock is held. 2840 * 2841 * Consumers which don't guarantee liveness of the vnode can use SMR to 2842 * try to get a reference. Note this operation can fail since the vnode 2843 * may be awaiting getting freed by the time they get to it. 
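 *
 * A lockless consumer follows this pattern (sketch; the lookup itself
 * and the retry policy are hypothetical):
 *
 *	vfs_smr_enter();
 *	vp = ...;			find the vnode under SMR
 *	vs = vget_prep_smr(vp);
 *	vfs_smr_exit();
 *	if (vs == VGET_NONE)
 *		return (EAGAIN);	the vnode was being freed
 *	error = vget_finish(vp, LK_SHARED, vs);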
2844 */ 2845 enum vgetstate 2846 vget_prep_smr(struct vnode *vp) 2847 { 2848 enum vgetstate vs; 2849 2850 VFS_SMR_ASSERT_ENTERED(); 2851 2852 if (refcount_acquire_if_not_zero(&vp->v_usecount)) { 2853 vs = VGET_USECOUNT; 2854 } else { 2855 if (vhold_smr(vp)) 2856 vs = VGET_HOLDCNT; 2857 else 2858 vs = VGET_NONE; 2859 } 2860 return (vs); 2861 } 2862 2863 enum vgetstate 2864 vget_prep(struct vnode *vp) 2865 { 2866 enum vgetstate vs; 2867 2868 if (refcount_acquire_if_not_zero(&vp->v_usecount)) { 2869 vs = VGET_USECOUNT; 2870 } else { 2871 vhold(vp); 2872 vs = VGET_HOLDCNT; 2873 } 2874 return (vs); 2875 } 2876 2877 void 2878 vget_abort(struct vnode *vp, enum vgetstate vs) 2879 { 2880 2881 switch (vs) { 2882 case VGET_USECOUNT: 2883 vrele(vp); 2884 break; 2885 case VGET_HOLDCNT: 2886 vdrop(vp); 2887 break; 2888 default: 2889 __assert_unreachable(); 2890 } 2891 } 2892 2893 int 2894 vget(struct vnode *vp, int flags) 2895 { 2896 enum vgetstate vs; 2897 2898 vs = vget_prep(vp); 2899 return (vget_finish(vp, flags, vs)); 2900 } 2901 2902 int 2903 vget_finish(struct vnode *vp, int flags, enum vgetstate vs) 2904 { 2905 int error; 2906 2907 if ((flags & LK_INTERLOCK) != 0) 2908 ASSERT_VI_LOCKED(vp, __func__); 2909 else 2910 ASSERT_VI_UNLOCKED(vp, __func__); 2911 VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp); 2912 VNPASS(vp->v_holdcnt > 0, vp); 2913 VNPASS(vs == VGET_HOLDCNT || vp->v_usecount > 0, vp); 2914 2915 error = vn_lock(vp, flags); 2916 if (__predict_false(error != 0)) { 2917 vget_abort(vp, vs); 2918 CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__, 2919 vp); 2920 return (error); 2921 } 2922 2923 vget_finish_ref(vp, vs); 2924 return (0); 2925 } 2926 2927 void 2928 vget_finish_ref(struct vnode *vp, enum vgetstate vs) 2929 { 2930 int old; 2931 2932 VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp); 2933 VNPASS(vp->v_holdcnt > 0, vp); 2934 VNPASS(vs == VGET_HOLDCNT || vp->v_usecount > 0, vp); 2935 2936 if (vs == VGET_USECOUNT) 2937 return; 2938 2939 /* 2940 * We hold the vnode. If the usecount is 0 it will be utilized to keep 2941 * the vnode around. Otherwise someone else lended their hold count and 2942 * we have to drop ours. 2943 */ 2944 old = atomic_fetchadd_int(&vp->v_usecount, 1); 2945 VNASSERT(old >= 0, vp, ("%s: wrong use count %d", __func__, old)); 2946 if (old != 0) { 2947 #ifdef INVARIANTS 2948 old = atomic_fetchadd_int(&vp->v_holdcnt, -1); 2949 VNASSERT(old > 1, vp, ("%s: wrong hold count %d", __func__, old)); 2950 #else 2951 refcount_release(&vp->v_holdcnt); 2952 #endif 2953 } 2954 } 2955 2956 void 2957 vref(struct vnode *vp) 2958 { 2959 enum vgetstate vs; 2960 2961 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 2962 vs = vget_prep(vp); 2963 vget_finish_ref(vp, vs); 2964 } 2965 2966 void 2967 vrefact(struct vnode *vp) 2968 { 2969 2970 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 2971 #ifdef INVARIANTS 2972 int old = atomic_fetchadd_int(&vp->v_usecount, 1); 2973 VNASSERT(old > 0, vp, ("%s: wrong use count %d", __func__, old)); 2974 #else 2975 refcount_acquire(&vp->v_usecount); 2976 #endif 2977 } 2978 2979 void 2980 vlazy(struct vnode *vp) 2981 { 2982 struct mount *mp; 2983 2984 VNASSERT(vp->v_holdcnt > 0, vp, ("%s: vnode not held", __func__)); 2985 2986 if ((vp->v_mflag & VMP_LAZYLIST) != 0) 2987 return; 2988 /* 2989 * We may get here for inactive routines after the vnode got doomed. 
2990 */ 2991 if (VN_IS_DOOMED(vp)) 2992 return; 2993 mp = vp->v_mount; 2994 mtx_lock(&mp->mnt_listmtx); 2995 if ((vp->v_mflag & VMP_LAZYLIST) == 0) { 2996 vp->v_mflag |= VMP_LAZYLIST; 2997 TAILQ_INSERT_TAIL(&mp->mnt_lazyvnodelist, vp, v_lazylist); 2998 mp->mnt_lazyvnodelistsize++; 2999 } 3000 mtx_unlock(&mp->mnt_listmtx); 3001 } 3002 3003 /* 3004 * This routine is only meant to be called from vgonel prior to dooming 3005 * the vnode. 3006 */ 3007 static void 3008 vunlazy_gone(struct vnode *vp) 3009 { 3010 struct mount *mp; 3011 3012 ASSERT_VOP_ELOCKED(vp, __func__); 3013 ASSERT_VI_LOCKED(vp, __func__); 3014 VNPASS(!VN_IS_DOOMED(vp), vp); 3015 3016 if (vp->v_mflag & VMP_LAZYLIST) { 3017 mp = vp->v_mount; 3018 mtx_lock(&mp->mnt_listmtx); 3019 VNPASS(vp->v_mflag & VMP_LAZYLIST, vp); 3020 vp->v_mflag &= ~VMP_LAZYLIST; 3021 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, vp, v_lazylist); 3022 mp->mnt_lazyvnodelistsize--; 3023 mtx_unlock(&mp->mnt_listmtx); 3024 } 3025 } 3026 3027 static void 3028 vdefer_inactive(struct vnode *vp) 3029 { 3030 3031 ASSERT_VI_LOCKED(vp, __func__); 3032 VNASSERT(vp->v_holdcnt > 0, vp, 3033 ("%s: vnode without hold count", __func__)); 3034 if (VN_IS_DOOMED(vp)) { 3035 vdropl(vp); 3036 return; 3037 } 3038 if (vp->v_iflag & VI_DEFINACT) { 3039 VNASSERT(vp->v_holdcnt > 1, vp, ("lost hold count")); 3040 vdropl(vp); 3041 return; 3042 } 3043 if (vp->v_usecount > 0) { 3044 vp->v_iflag &= ~VI_OWEINACT; 3045 vdropl(vp); 3046 return; 3047 } 3048 vlazy(vp); 3049 vp->v_iflag |= VI_DEFINACT; 3050 VI_UNLOCK(vp); 3051 counter_u64_add(deferred_inact, 1); 3052 } 3053 3054 static void 3055 vdefer_inactive_unlocked(struct vnode *vp) 3056 { 3057 3058 VI_LOCK(vp); 3059 if ((vp->v_iflag & VI_OWEINACT) == 0) { 3060 vdropl(vp); 3061 return; 3062 } 3063 vdefer_inactive(vp); 3064 } 3065 3066 enum vput_op { VRELE, VPUT, VUNREF }; 3067 3068 /* 3069 * Handle ->v_usecount transitioning to 0. 3070 * 3071 * By releasing the last usecount we take ownership of the hold count which 3072 * provides liveness of the vnode, meaning we have to vdrop. 3073 * 3074 * For all vnodes we may need to perform inactive processing. It requires an 3075 * exclusive lock on the vnode, while it is legal to call here with only a 3076 * shared lock (or no locks). If locking the vnode in an expected manner fails, 3077 * inactive processing gets deferred to the syncer. 3078 * 3079 * XXX Some filesystems pass in an exclusively locked vnode and strongly depend 3080 * on the lock being held all the way until VOP_INACTIVE. This in particular 3081 * happens with UFS which adds half-constructed vnodes to the hash, where they 3082 * can be found by other code. 3083 */ 3084 static void 3085 vput_final(struct vnode *vp, enum vput_op func) 3086 { 3087 int error; 3088 bool want_unlock; 3089 3090 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3091 VNPASS(vp->v_holdcnt > 0, vp); 3092 3093 VI_LOCK(vp); 3094 3095 /* 3096 * By the time we got here someone else might have transitioned 3097 * the count back to > 0. 3098 */ 3099 if (vp->v_usecount > 0) 3100 goto out; 3101 3102 /* 3103 * If the vnode is doomed vgone already performed inactive processing 3104 * (if needed). 3105 */ 3106 if (VN_IS_DOOMED(vp)) 3107 goto out; 3108 3109 if (__predict_true(VOP_NEED_INACTIVE(vp) == 0)) 3110 goto out; 3111 3112 if (vp->v_iflag & VI_DOINGINACT) 3113 goto out; 3114 3115 /* 3116 * Locking operations here will drop the interlock and possibly the 3117 * vnode lock, opening a window where the vnode can get doomed all the 3118 * while ->v_usecount is 0. 
Set VI_OWEINACT to let vgone know to 3119 * perform inactive. 3120 */ 3121 vp->v_iflag |= VI_OWEINACT; 3122 want_unlock = false; 3123 error = 0; 3124 switch (func) { 3125 case VRELE: 3126 switch (VOP_ISLOCKED(vp)) { 3127 case LK_EXCLUSIVE: 3128 break; 3129 case LK_EXCLOTHER: 3130 case 0: 3131 want_unlock = true; 3132 error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK); 3133 VI_LOCK(vp); 3134 break; 3135 default: 3136 /* 3137 * The lock has at least one sharer, but we have no way 3138 * to conclude whether this is us. Play it safe and 3139 * defer processing. 3140 */ 3141 error = EAGAIN; 3142 break; 3143 } 3144 break; 3145 case VPUT: 3146 want_unlock = true; 3147 if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { 3148 error = VOP_LOCK(vp, LK_UPGRADE | LK_INTERLOCK | 3149 LK_NOWAIT); 3150 VI_LOCK(vp); 3151 } 3152 break; 3153 case VUNREF: 3154 if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { 3155 error = VOP_LOCK(vp, LK_TRYUPGRADE | LK_INTERLOCK); 3156 VI_LOCK(vp); 3157 } 3158 break; 3159 } 3160 if (error == 0) { 3161 if (func == VUNREF) { 3162 VNASSERT((vp->v_vflag & VV_UNREF) == 0, vp, 3163 ("recursive vunref")); 3164 vp->v_vflag |= VV_UNREF; 3165 } 3166 for (;;) { 3167 error = vinactive(vp); 3168 if (want_unlock) 3169 VOP_UNLOCK(vp); 3170 if (error != ERELOOKUP || !want_unlock) 3171 break; 3172 VOP_LOCK(vp, LK_EXCLUSIVE); 3173 } 3174 if (func == VUNREF) 3175 vp->v_vflag &= ~VV_UNREF; 3176 vdropl(vp); 3177 } else { 3178 vdefer_inactive(vp); 3179 } 3180 return; 3181 out: 3182 if (func == VPUT) 3183 VOP_UNLOCK(vp); 3184 vdropl(vp); 3185 } 3186 3187 /* 3188 * Decrement ->v_usecount for a vnode. 3189 * 3190 * Releasing the last use count requires additional processing, see vput_final 3191 * above for details. 3192 * 3193 * Comment above each variant denotes lock state on entry and exit. 3194 */ 3195 3196 /* 3197 * in: any 3198 * out: same as passed in 3199 */ 3200 void 3201 vrele(struct vnode *vp) 3202 { 3203 3204 ASSERT_VI_UNLOCKED(vp, __func__); 3205 if (!refcount_release(&vp->v_usecount)) 3206 return; 3207 vput_final(vp, VRELE); 3208 } 3209 3210 /* 3211 * in: locked 3212 * out: unlocked 3213 */ 3214 void 3215 vput(struct vnode *vp) 3216 { 3217 3218 ASSERT_VOP_LOCKED(vp, __func__); 3219 ASSERT_VI_UNLOCKED(vp, __func__); 3220 if (!refcount_release(&vp->v_usecount)) { 3221 VOP_UNLOCK(vp); 3222 return; 3223 } 3224 vput_final(vp, VPUT); 3225 } 3226 3227 /* 3228 * in: locked 3229 * out: locked 3230 */ 3231 void 3232 vunref(struct vnode *vp) 3233 { 3234 3235 ASSERT_VOP_LOCKED(vp, __func__); 3236 ASSERT_VI_UNLOCKED(vp, __func__); 3237 if (!refcount_release(&vp->v_usecount)) 3238 return; 3239 vput_final(vp, VUNREF); 3240 } 3241 3242 void 3243 vhold(struct vnode *vp) 3244 { 3245 int old; 3246 3247 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3248 old = atomic_fetchadd_int(&vp->v_holdcnt, 1); 3249 VNASSERT(old >= 0 && (old & VHOLD_ALL_FLAGS) == 0, vp, 3250 ("%s: wrong hold count %d", __func__, old)); 3251 if (old == 0) 3252 vn_freevnodes_dec(); 3253 } 3254 3255 void 3256 vholdnz(struct vnode *vp) 3257 { 3258 3259 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3260 #ifdef INVARIANTS 3261 int old = atomic_fetchadd_int(&vp->v_holdcnt, 1); 3262 VNASSERT(old > 0 && (old & VHOLD_ALL_FLAGS) == 0, vp, 3263 ("%s: wrong hold count %d", __func__, old)); 3264 #else 3265 atomic_add_int(&vp->v_holdcnt, 1); 3266 #endif 3267 } 3268 3269 /* 3270 * Grab a hold count unless the vnode is freed. 3271 * 3272 * Only use this routine if vfs smr is the only protection you have against 3273 * freeing the vnode. 
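 *
 * (vget_prep_smr() above is the typical consumer: it falls back to this
 * routine only when a use count reference could not be acquired
 * directly.)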
3274 * 3275 * The code loops trying to add a hold count as long as the VHOLD_NO_SMR flag 3276 * is not set. After the flag is set the vnode becomes immutable to anyone but 3277 * the thread which managed to set the flag. 3278 * 3279 * It may be tempting to replace the loop with: 3280 * count = atomic_fetchadd_int(&vp->v_holdcnt, 1); 3281 * if (count & VHOLD_NO_SMR) { 3282 * backpedal and error out; 3283 * } 3284 * 3285 * However, while this is more performant, it hinders debugging by eliminating 3286 * the previously mentioned invariant. 3287 */ 3288 bool 3289 vhold_smr(struct vnode *vp) 3290 { 3291 int count; 3292 3293 VFS_SMR_ASSERT_ENTERED(); 3294 3295 count = atomic_load_int(&vp->v_holdcnt); 3296 for (;;) { 3297 if (count & VHOLD_NO_SMR) { 3298 VNASSERT((count & ~VHOLD_NO_SMR) == 0, vp, 3299 ("non-zero hold count with flags %d\n", count)); 3300 return (false); 3301 } 3302 VNASSERT(count >= 0, vp, ("invalid hold count %d\n", count)); 3303 if (atomic_fcmpset_int(&vp->v_holdcnt, &count, count + 1)) { 3304 if (count == 0) 3305 vn_freevnodes_dec(); 3306 return (true); 3307 } 3308 } 3309 } 3310 3311 /* 3312 * Hold a free vnode for recycling. 3313 * 3314 * Note: vnode_init references this comment. 3315 * 3316 * Attempts to recycle only need the global vnode list lock and have no use for 3317 * SMR. 3318 * 3319 * However, vnodes get inserted into the global list before they get fully 3320 * initialized and stay there until UMA decides to free the memory. This in 3321 * particular means the target can be found before it becomes usable and after 3322 * it becomes recycled. Picking up such vnodes is guarded with v_holdcnt set to 3323 * VHOLD_NO_SMR. 3324 * 3325 * Note: the vnode may gain more references after we transition the count 0->1. 3326 */ 3327 static bool 3328 vhold_recycle_free(struct vnode *vp) 3329 { 3330 int count; 3331 3332 mtx_assert(&vnode_list_mtx, MA_OWNED); 3333 3334 count = atomic_load_int(&vp->v_holdcnt); 3335 for (;;) { 3336 if (count & VHOLD_NO_SMR) { 3337 VNASSERT((count & ~VHOLD_NO_SMR) == 0, vp, 3338 ("non-zero hold count with flags %d\n", count)); 3339 return (false); 3340 } 3341 VNASSERT(count >= 0, vp, ("invalid hold count %d\n", count)); 3342 if (count > 0) { 3343 return (false); 3344 } 3345 if (atomic_fcmpset_int(&vp->v_holdcnt, &count, count + 1)) { 3346 vn_freevnodes_dec(); 3347 return (true); 3348 } 3349 } 3350 } 3351 3352 static void __noinline 3353 vdbatch_process(struct vdbatch *vd) 3354 { 3355 struct vnode *vp; 3356 int i; 3357 3358 mtx_assert(&vd->lock, MA_OWNED); 3359 MPASS(curthread->td_pinned > 0); 3360 MPASS(vd->index == VDBATCH_SIZE); 3361 3362 mtx_lock(&vnode_list_mtx); 3363 critical_enter(); 3364 freevnodes += vd->freevnodes; 3365 for (i = 0; i < VDBATCH_SIZE; i++) { 3366 vp = vd->tab[i]; 3367 TAILQ_REMOVE(&vnode_list, vp, v_vnodelist); 3368 TAILQ_INSERT_TAIL(&vnode_list, vp, v_vnodelist); 3369 MPASS(vp->v_dbatchcpu != NOCPU); 3370 vp->v_dbatchcpu = NOCPU; 3371 } 3372 mtx_unlock(&vnode_list_mtx); 3373 vd->freevnodes = 0; 3374 bzero(vd->tab, sizeof(vd->tab)); 3375 vd->index = 0; 3376 critical_exit(); 3377 } 3378 3379 static void 3380 vdbatch_enqueue(struct vnode *vp) 3381 { 3382 struct vdbatch *vd; 3383 3384 ASSERT_VI_LOCKED(vp, __func__); 3385 VNASSERT(!VN_IS_DOOMED(vp), vp, 3386 ("%s: deferring requeue of a doomed vnode", __func__)); 3387 3388 if (vp->v_dbatchcpu != NOCPU) { 3389 VI_UNLOCK(vp); 3390 return; 3391 } 3392 3393 sched_pin(); 3394 vd = DPCPU_PTR(vd); 3395 mtx_lock(&vd->lock); 3396 MPASS(vd->index < VDBATCH_SIZE); 3397 
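	/*
	 * The slot at vd->index must be empty: vdbatch_process() zeroes
	 * the table and resets the index without dropping vd->lock.
	 */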
MPASS(vd->tab[vd->index] == NULL); 3398 /* 3399 * A hack: we depend on being pinned so that we know what to put in 3400 * ->v_dbatchcpu. 3401 */ 3402 vp->v_dbatchcpu = curcpu; 3403 vd->tab[vd->index] = vp; 3404 vd->index++; 3405 VI_UNLOCK(vp); 3406 if (vd->index == VDBATCH_SIZE) 3407 vdbatch_process(vd); 3408 mtx_unlock(&vd->lock); 3409 sched_unpin(); 3410 } 3411 3412 /* 3413 * This routine must only be called for vnodes which are about to be 3414 * deallocated. Supporting dequeue for arbitrary vndoes would require 3415 * validating that the locked batch matches. 3416 */ 3417 static void 3418 vdbatch_dequeue(struct vnode *vp) 3419 { 3420 struct vdbatch *vd; 3421 int i; 3422 short cpu; 3423 3424 VNASSERT(vp->v_type == VBAD || vp->v_type == VNON, vp, 3425 ("%s: called for a used vnode\n", __func__)); 3426 3427 cpu = vp->v_dbatchcpu; 3428 if (cpu == NOCPU) 3429 return; 3430 3431 vd = DPCPU_ID_PTR(cpu, vd); 3432 mtx_lock(&vd->lock); 3433 for (i = 0; i < vd->index; i++) { 3434 if (vd->tab[i] != vp) 3435 continue; 3436 vp->v_dbatchcpu = NOCPU; 3437 vd->index--; 3438 vd->tab[i] = vd->tab[vd->index]; 3439 vd->tab[vd->index] = NULL; 3440 break; 3441 } 3442 mtx_unlock(&vd->lock); 3443 /* 3444 * Either we dequeued the vnode above or the target CPU beat us to it. 3445 */ 3446 MPASS(vp->v_dbatchcpu == NOCPU); 3447 } 3448 3449 /* 3450 * Drop the hold count of the vnode. If this is the last reference to 3451 * the vnode we place it on the free list unless it has been vgone'd 3452 * (marked VIRF_DOOMED) in which case we will free it. 3453 * 3454 * Because the vnode vm object keeps a hold reference on the vnode if 3455 * there is at least one resident non-cached page, the vnode cannot 3456 * leave the active list without the page cleanup done. 3457 */ 3458 static void 3459 vdrop_deactivate(struct vnode *vp) 3460 { 3461 struct mount *mp; 3462 3463 ASSERT_VI_LOCKED(vp, __func__); 3464 /* 3465 * Mark a vnode as free: remove it from its active list 3466 * and put it up for recycling on the freelist. 3467 */ 3468 VNASSERT(!VN_IS_DOOMED(vp), vp, 3469 ("vdrop: returning doomed vnode")); 3470 VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp, 3471 ("vnode with VI_OWEINACT set")); 3472 VNASSERT((vp->v_iflag & VI_DEFINACT) == 0, vp, 3473 ("vnode with VI_DEFINACT set")); 3474 if (vp->v_mflag & VMP_LAZYLIST) { 3475 mp = vp->v_mount; 3476 mtx_lock(&mp->mnt_listmtx); 3477 VNASSERT(vp->v_mflag & VMP_LAZYLIST, vp, ("lost VMP_LAZYLIST")); 3478 /* 3479 * Don't remove the vnode from the lazy list if another thread 3480 * has increased the hold count. It may have re-enqueued the 3481 * vnode to the lazy list and is now responsible for its 3482 * removal. 3483 */ 3484 if (vp->v_holdcnt == 0) { 3485 vp->v_mflag &= ~VMP_LAZYLIST; 3486 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, vp, v_lazylist); 3487 mp->mnt_lazyvnodelistsize--; 3488 } 3489 mtx_unlock(&mp->mnt_listmtx); 3490 } 3491 vdbatch_enqueue(vp); 3492 } 3493 3494 static void __noinline 3495 vdropl_final(struct vnode *vp) 3496 { 3497 3498 ASSERT_VI_LOCKED(vp, __func__); 3499 VNPASS(VN_IS_DOOMED(vp), vp); 3500 /* 3501 * Set the VHOLD_NO_SMR flag. 3502 * 3503 * We may be racing against vhold_smr. If they win we can just pretend 3504 * we never got this far, they will vdrop later. 3505 */ 3506 if (__predict_false(!atomic_cmpset_int(&vp->v_holdcnt, 0, VHOLD_NO_SMR))) { 3507 vn_freevnodes_inc(); 3508 VI_UNLOCK(vp); 3509 /* 3510 * We lost the aforementioned race. Any subsequent access is 3511 * invalid as they might have managed to vdropl on their own. 
3512 */ 3513 return; 3514 } 3515 /* 3516 * Don't bump freevnodes as this one is going away. 3517 */ 3518 freevnode(vp); 3519 } 3520 3521 void 3522 vdrop(struct vnode *vp) 3523 { 3524 3525 ASSERT_VI_UNLOCKED(vp, __func__); 3526 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3527 if (refcount_release_if_not_last(&vp->v_holdcnt)) 3528 return; 3529 VI_LOCK(vp); 3530 vdropl(vp); 3531 } 3532 3533 void 3534 vdropl(struct vnode *vp) 3535 { 3536 3537 ASSERT_VI_LOCKED(vp, __func__); 3538 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3539 if (!refcount_release(&vp->v_holdcnt)) { 3540 VI_UNLOCK(vp); 3541 return; 3542 } 3543 if (!VN_IS_DOOMED(vp)) { 3544 vn_freevnodes_inc(); 3545 vdrop_deactivate(vp); 3546 /* 3547 * Also unlocks the interlock. We can't assert on it as we 3548 * released our hold and by now the vnode might have been 3549 * freed. 3550 */ 3551 return; 3552 } 3553 vdropl_final(vp); 3554 } 3555 3556 /* 3557 * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT 3558 * flags. DOINGINACT prevents us from recursing in calls to vinactive. 3559 */ 3560 static int 3561 vinactivef(struct vnode *vp) 3562 { 3563 struct vm_object *obj; 3564 int error; 3565 3566 ASSERT_VOP_ELOCKED(vp, "vinactive"); 3567 ASSERT_VI_LOCKED(vp, "vinactive"); 3568 VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp, 3569 ("vinactive: recursed on VI_DOINGINACT")); 3570 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3571 vp->v_iflag |= VI_DOINGINACT; 3572 vp->v_iflag &= ~VI_OWEINACT; 3573 VI_UNLOCK(vp); 3574 /* 3575 * Before moving off the active list, we must be sure that any 3576 * modified pages are converted into the vnode's dirty 3577 * buffers, since these will no longer be checked once the 3578 * vnode is on the inactive list. 3579 * 3580 * The write-out of the dirty pages is asynchronous. At the 3581 * point that VOP_INACTIVE() is called, there could still be 3582 * pending I/O and dirty pages in the object. 3583 */ 3584 if ((obj = vp->v_object) != NULL && (vp->v_vflag & VV_NOSYNC) == 0 && 3585 vm_object_mightbedirty(obj)) { 3586 VM_OBJECT_WLOCK(obj); 3587 vm_object_page_clean(obj, 0, 0, 0); 3588 VM_OBJECT_WUNLOCK(obj); 3589 } 3590 error = VOP_INACTIVE(vp); 3591 VI_LOCK(vp); 3592 VNASSERT(vp->v_iflag & VI_DOINGINACT, vp, 3593 ("vinactive: lost VI_DOINGINACT")); 3594 vp->v_iflag &= ~VI_DOINGINACT; 3595 return (error); 3596 } 3597 3598 int 3599 vinactive(struct vnode *vp) 3600 { 3601 3602 ASSERT_VOP_ELOCKED(vp, "vinactive"); 3603 ASSERT_VI_LOCKED(vp, "vinactive"); 3604 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3605 3606 if ((vp->v_iflag & VI_OWEINACT) == 0) 3607 return (0); 3608 if (vp->v_iflag & VI_DOINGINACT) 3609 return (0); 3610 if (vp->v_usecount > 0) { 3611 vp->v_iflag &= ~VI_OWEINACT; 3612 return (0); 3613 } 3614 return (vinactivef(vp)); 3615 } 3616 3617 /* 3618 * Remove any vnodes in the vnode table belonging to mount point mp. 3619 * 3620 * If FORCECLOSE is not specified, there should not be any active ones, 3621 * return error if any are found (nb: this is a user error, not a 3622 * system error). If FORCECLOSE is specified, detach any active vnodes 3623 * that are found. 3624 * 3625 * If WRITECLOSE is set, only flush out regular file vnodes open for 3626 * writing. 3627 * 3628 * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped. 3629 * 3630 * `rootrefs' specifies the base reference count for the root vnode 3631 * of this filesystem. The root vnode is considered busy if its 3632 * v_usecount exceeds this value. 
On a successful return, vflush(, td) 3633 * will call vrele() on the root vnode exactly rootrefs times. 3634 * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must 3635 * be zero. 3636 */ 3637 #ifdef DIAGNOSTIC 3638 static int busyprt = 0; /* print out busy vnodes */ 3639 SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "Print out busy vnodes"); 3640 #endif 3641 3642 int 3643 vflush(struct mount *mp, int rootrefs, int flags, struct thread *td) 3644 { 3645 struct vnode *vp, *mvp, *rootvp = NULL; 3646 struct vattr vattr; 3647 int busy = 0, error; 3648 3649 CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp, 3650 rootrefs, flags); 3651 if (rootrefs > 0) { 3652 KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0, 3653 ("vflush: bad args")); 3654 /* 3655 * Get the filesystem root vnode. We can vput() it 3656 * immediately, since with rootrefs > 0, it won't go away. 3657 */ 3658 if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) { 3659 CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d", 3660 __func__, error); 3661 return (error); 3662 } 3663 vput(rootvp); 3664 } 3665 loop: 3666 MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { 3667 vholdl(vp); 3668 error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE); 3669 if (error) { 3670 vdrop(vp); 3671 MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); 3672 goto loop; 3673 } 3674 /* 3675 * Skip over a vnodes marked VV_SYSTEM. 3676 */ 3677 if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) { 3678 VOP_UNLOCK(vp); 3679 vdrop(vp); 3680 continue; 3681 } 3682 /* 3683 * If WRITECLOSE is set, flush out unlinked but still open 3684 * files (even if open only for reading) and regular file 3685 * vnodes open for writing. 3686 */ 3687 if (flags & WRITECLOSE) { 3688 if (vp->v_object != NULL) { 3689 VM_OBJECT_WLOCK(vp->v_object); 3690 vm_object_page_clean(vp->v_object, 0, 0, 0); 3691 VM_OBJECT_WUNLOCK(vp->v_object); 3692 } 3693 do { 3694 error = VOP_FSYNC(vp, MNT_WAIT, td); 3695 } while (error == ERELOOKUP); 3696 if (error != 0) { 3697 VOP_UNLOCK(vp); 3698 vdrop(vp); 3699 MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); 3700 return (error); 3701 } 3702 error = VOP_GETATTR(vp, &vattr, td->td_ucred); 3703 VI_LOCK(vp); 3704 3705 if ((vp->v_type == VNON || 3706 (error == 0 && vattr.va_nlink > 0)) && 3707 (vp->v_writecount <= 0 || vp->v_type != VREG)) { 3708 VOP_UNLOCK(vp); 3709 vdropl(vp); 3710 continue; 3711 } 3712 } else 3713 VI_LOCK(vp); 3714 /* 3715 * With v_usecount == 0, all we need to do is clear out the 3716 * vnode data structures and we are done. 3717 * 3718 * If FORCECLOSE is set, forcibly close the vnode. 3719 */ 3720 if (vp->v_usecount == 0 || (flags & FORCECLOSE)) { 3721 vgonel(vp); 3722 } else { 3723 busy++; 3724 #ifdef DIAGNOSTIC 3725 if (busyprt) 3726 vn_printf(vp, "vflush: busy vnode "); 3727 #endif 3728 } 3729 VOP_UNLOCK(vp); 3730 vdropl(vp); 3731 } 3732 if (rootrefs > 0 && (flags & FORCECLOSE) == 0) { 3733 /* 3734 * If just the root vnode is busy, and if its refcount 3735 * is equal to `rootrefs', then go ahead and kill it. 
3736 */ 3737 VI_LOCK(rootvp); 3738 KASSERT(busy > 0, ("vflush: not busy")); 3739 VNASSERT(rootvp->v_usecount >= rootrefs, rootvp, 3740 ("vflush: usecount %d < rootrefs %d", 3741 rootvp->v_usecount, rootrefs)); 3742 if (busy == 1 && rootvp->v_usecount == rootrefs) { 3743 VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK); 3744 vgone(rootvp); 3745 VOP_UNLOCK(rootvp); 3746 busy = 0; 3747 } else 3748 VI_UNLOCK(rootvp); 3749 } 3750 if (busy) { 3751 CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__, 3752 busy); 3753 return (EBUSY); 3754 } 3755 for (; rootrefs > 0; rootrefs--) 3756 vrele(rootvp); 3757 return (0); 3758 } 3759 3760 /* 3761 * Recycle an unused vnode to the front of the free list. 3762 */ 3763 int 3764 vrecycle(struct vnode *vp) 3765 { 3766 int recycled; 3767 3768 VI_LOCK(vp); 3769 recycled = vrecyclel(vp); 3770 VI_UNLOCK(vp); 3771 return (recycled); 3772 } 3773 3774 /* 3775 * vrecycle, with the vp interlock held. 3776 */ 3777 int 3778 vrecyclel(struct vnode *vp) 3779 { 3780 int recycled; 3781 3782 ASSERT_VOP_ELOCKED(vp, __func__); 3783 ASSERT_VI_LOCKED(vp, __func__); 3784 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3785 recycled = 0; 3786 if (vp->v_usecount == 0) { 3787 recycled = 1; 3788 vgonel(vp); 3789 } 3790 return (recycled); 3791 } 3792 3793 /* 3794 * Eliminate all activity associated with a vnode 3795 * in preparation for reuse. 3796 */ 3797 void 3798 vgone(struct vnode *vp) 3799 { 3800 VI_LOCK(vp); 3801 vgonel(vp); 3802 VI_UNLOCK(vp); 3803 } 3804 3805 static void 3806 notify_lowervp_vfs_dummy(struct mount *mp __unused, 3807 struct vnode *lowervp __unused) 3808 { 3809 } 3810 3811 /* 3812 * Notify upper mounts about reclaimed or unlinked vnode. 3813 */ 3814 void 3815 vfs_notify_upper(struct vnode *vp, int event) 3816 { 3817 static struct vfsops vgonel_vfsops = { 3818 .vfs_reclaim_lowervp = notify_lowervp_vfs_dummy, 3819 .vfs_unlink_lowervp = notify_lowervp_vfs_dummy, 3820 }; 3821 struct mount *mp, *ump, *mmp; 3822 3823 mp = vp->v_mount; 3824 if (mp == NULL) 3825 return; 3826 if (TAILQ_EMPTY(&mp->mnt_uppers)) 3827 return; 3828 3829 mmp = malloc(sizeof(struct mount), M_TEMP, M_WAITOK | M_ZERO); 3830 mmp->mnt_op = &vgonel_vfsops; 3831 mmp->mnt_kern_flag |= MNTK_MARKER; 3832 MNT_ILOCK(mp); 3833 mp->mnt_kern_flag |= MNTK_VGONE_UPPER; 3834 for (ump = TAILQ_FIRST(&mp->mnt_uppers); ump != NULL;) { 3835 if ((ump->mnt_kern_flag & MNTK_MARKER) != 0) { 3836 ump = TAILQ_NEXT(ump, mnt_upper_link); 3837 continue; 3838 } 3839 TAILQ_INSERT_AFTER(&mp->mnt_uppers, ump, mmp, mnt_upper_link); 3840 MNT_IUNLOCK(mp); 3841 switch (event) { 3842 case VFS_NOTIFY_UPPER_RECLAIM: 3843 VFS_RECLAIM_LOWERVP(ump, vp); 3844 break; 3845 case VFS_NOTIFY_UPPER_UNLINK: 3846 VFS_UNLINK_LOWERVP(ump, vp); 3847 break; 3848 default: 3849 KASSERT(0, ("invalid event %d", event)); 3850 break; 3851 } 3852 MNT_ILOCK(mp); 3853 ump = TAILQ_NEXT(mmp, mnt_upper_link); 3854 TAILQ_REMOVE(&mp->mnt_uppers, mmp, mnt_upper_link); 3855 } 3856 free(mmp, M_TEMP); 3857 mp->mnt_kern_flag &= ~MNTK_VGONE_UPPER; 3858 if ((mp->mnt_kern_flag & MNTK_VGONE_WAITER) != 0) { 3859 mp->mnt_kern_flag &= ~MNTK_VGONE_WAITER; 3860 wakeup(&mp->mnt_uppers); 3861 } 3862 MNT_IUNLOCK(mp); 3863 } 3864 3865 /* 3866 * vgone, with the vp interlock held. 
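 *
 * In outline, the body below marks the vnode VIRF_DOOMED, closes and
 * deactivates it if it was in use, flushes and invalidates its buffers
 * and VM object, calls VOP_RECLAIM(), purges advisory locks, removes
 * the vnode from its mount point list and finally switches it to
 * dead_vnodeops with type VBAD.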
3867 */ 3868 static void 3869 vgonel(struct vnode *vp) 3870 { 3871 struct thread *td; 3872 struct mount *mp; 3873 vm_object_t object; 3874 bool active, doinginact, oweinact; 3875 3876 ASSERT_VOP_ELOCKED(vp, "vgonel"); 3877 ASSERT_VI_LOCKED(vp, "vgonel"); 3878 VNASSERT(vp->v_holdcnt, vp, 3879 ("vgonel: vp %p has no reference.", vp)); 3880 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3881 td = curthread; 3882 3883 /* 3884 * Don't vgonel if we're already doomed. 3885 */ 3886 if (VN_IS_DOOMED(vp)) 3887 return; 3888 /* 3889 * Paired with freevnode. 3890 */ 3891 vn_seqc_write_begin_locked(vp); 3892 vunlazy_gone(vp); 3893 vn_irflag_set_locked(vp, VIRF_DOOMED); 3894 3895 /* 3896 * Check to see if the vnode is in use. If so, we have to 3897 * call VOP_CLOSE() and VOP_INACTIVE(). 3898 * 3899 * It could be that VOP_INACTIVE() requested reclamation, in 3900 * which case we should avoid recursion, so check 3901 * VI_DOINGINACT. This is not precise but good enough. 3902 */ 3903 active = vp->v_usecount > 0; 3904 oweinact = (vp->v_iflag & VI_OWEINACT) != 0; 3905 doinginact = (vp->v_iflag & VI_DOINGINACT) != 0; 3906 3907 /* 3908 * If we need to do inactive VI_OWEINACT will be set. 3909 */ 3910 if (vp->v_iflag & VI_DEFINACT) { 3911 VNASSERT(vp->v_holdcnt > 1, vp, ("lost hold count")); 3912 vp->v_iflag &= ~VI_DEFINACT; 3913 vdropl(vp); 3914 } else { 3915 VNASSERT(vp->v_holdcnt > 0, vp, ("vnode without hold count")); 3916 VI_UNLOCK(vp); 3917 } 3918 cache_purge_vgone(vp); 3919 vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM); 3920 3921 /* 3922 * If purging an active vnode, it must be closed and 3923 * deactivated before being reclaimed. 3924 */ 3925 if (active) 3926 VOP_CLOSE(vp, FNONBLOCK, NOCRED, td); 3927 if (!doinginact) { 3928 do { 3929 if (oweinact || active) { 3930 VI_LOCK(vp); 3931 vinactivef(vp); 3932 oweinact = (vp->v_iflag & VI_OWEINACT) != 0; 3933 VI_UNLOCK(vp); 3934 } 3935 } while (oweinact); 3936 } 3937 if (vp->v_type == VSOCK) 3938 vfs_unp_reclaim(vp); 3939 3940 /* 3941 * Clean out any buffers associated with the vnode. 3942 * If the flush fails, just toss the buffers. 3943 */ 3944 mp = NULL; 3945 if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd)) 3946 (void) vn_start_secondary_write(vp, &mp, V_WAIT); 3947 if (vinvalbuf(vp, V_SAVE, 0, 0) != 0) { 3948 while (vinvalbuf(vp, 0, 0, 0) != 0) 3949 ; 3950 } 3951 3952 BO_LOCK(&vp->v_bufobj); 3953 KASSERT(TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd) && 3954 vp->v_bufobj.bo_dirty.bv_cnt == 0 && 3955 TAILQ_EMPTY(&vp->v_bufobj.bo_clean.bv_hd) && 3956 vp->v_bufobj.bo_clean.bv_cnt == 0, 3957 ("vp %p bufobj not invalidated", vp)); 3958 3959 /* 3960 * For VMIO bufobj, BO_DEAD is set later, or in 3961 * vm_object_terminate() after the object's page queue is 3962 * flushed. 3963 */ 3964 object = vp->v_bufobj.bo_object; 3965 if (object == NULL) 3966 vp->v_bufobj.bo_flag |= BO_DEAD; 3967 BO_UNLOCK(&vp->v_bufobj); 3968 3969 /* 3970 * Handle the VM part. Tmpfs handles v_object on its own (the 3971 * OBJT_VNODE check). Nullfs or other bypassing filesystems 3972 * should not touch the object borrowed from the lower vnode 3973 * (the handle check). 3974 */ 3975 if (object != NULL && object->type == OBJT_VNODE && 3976 object->handle == vp) 3977 vnode_destroy_vobject(vp); 3978 3979 /* 3980 * Reclaim the vnode. 
3981 */ 3982 if (VOP_RECLAIM(vp)) 3983 panic("vgone: cannot reclaim"); 3984 if (mp != NULL) 3985 vn_finished_secondary_write(mp); 3986 VNASSERT(vp->v_object == NULL, vp, 3987 ("vop_reclaim left v_object vp=%p", vp)); 3988 /* 3989 * Clear the advisory locks and wake up waiting threads. 3990 */ 3991 (void)VOP_ADVLOCKPURGE(vp); 3992 vp->v_lockf = NULL; 3993 /* 3994 * Delete from old mount point vnode list. 3995 */ 3996 delmntque(vp); 3997 /* 3998 * Done with purge, reset to the standard lock and invalidate 3999 * the vnode. 4000 */ 4001 VI_LOCK(vp); 4002 vp->v_vnlock = &vp->v_lock; 4003 vp->v_op = &dead_vnodeops; 4004 vp->v_type = VBAD; 4005 } 4006 4007 /* 4008 * Print out a description of a vnode. 4009 */ 4010 static const char * const typename[] = 4011 {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD", 4012 "VMARKER"}; 4013 4014 _Static_assert((VHOLD_ALL_FLAGS & ~VHOLD_NO_SMR) == 0, 4015 "new hold count flag not added to vn_printf"); 4016 4017 void 4018 vn_printf(struct vnode *vp, const char *fmt, ...) 4019 { 4020 va_list ap; 4021 char buf[256], buf2[16]; 4022 u_long flags; 4023 u_int holdcnt; 4024 short irflag; 4025 4026 va_start(ap, fmt); 4027 vprintf(fmt, ap); 4028 va_end(ap); 4029 printf("%p: ", (void *)vp); 4030 printf("type %s\n", typename[vp->v_type]); 4031 holdcnt = atomic_load_int(&vp->v_holdcnt); 4032 printf(" usecount %d, writecount %d, refcount %d seqc users %d", 4033 vp->v_usecount, vp->v_writecount, holdcnt & ~VHOLD_ALL_FLAGS, 4034 vp->v_seqc_users); 4035 switch (vp->v_type) { 4036 case VDIR: 4037 printf(" mountedhere %p\n", vp->v_mountedhere); 4038 break; 4039 case VCHR: 4040 printf(" rdev %p\n", vp->v_rdev); 4041 break; 4042 case VSOCK: 4043 printf(" socket %p\n", vp->v_unpcb); 4044 break; 4045 case VFIFO: 4046 printf(" fifoinfo %p\n", vp->v_fifoinfo); 4047 break; 4048 default: 4049 printf("\n"); 4050 break; 4051 } 4052 buf[0] = '\0'; 4053 buf[1] = '\0'; 4054 if (holdcnt & VHOLD_NO_SMR) 4055 strlcat(buf, "|VHOLD_NO_SMR", sizeof(buf)); 4056 printf(" hold count flags (%s)\n", buf + 1); 4057 4058 buf[0] = '\0'; 4059 buf[1] = '\0'; 4060 irflag = vn_irflag_read(vp); 4061 if (irflag & VIRF_DOOMED) 4062 strlcat(buf, "|VIRF_DOOMED", sizeof(buf)); 4063 if (irflag & VIRF_PGREAD) 4064 strlcat(buf, "|VIRF_PGREAD", sizeof(buf)); 4065 if (irflag & VIRF_MOUNTPOINT) 4066 strlcat(buf, "|VIRF_MOUNTPOINT", sizeof(buf)); 4067 flags = irflag & ~(VIRF_DOOMED | VIRF_PGREAD | VIRF_MOUNTPOINT); 4068 if (flags != 0) { 4069 snprintf(buf2, sizeof(buf2), "|VIRF(0x%lx)", flags); 4070 strlcat(buf, buf2, sizeof(buf)); 4071 } 4072 if (vp->v_vflag & VV_ROOT) 4073 strlcat(buf, "|VV_ROOT", sizeof(buf)); 4074 if (vp->v_vflag & VV_ISTTY) 4075 strlcat(buf, "|VV_ISTTY", sizeof(buf)); 4076 if (vp->v_vflag & VV_NOSYNC) 4077 strlcat(buf, "|VV_NOSYNC", sizeof(buf)); 4078 if (vp->v_vflag & VV_ETERNALDEV) 4079 strlcat(buf, "|VV_ETERNALDEV", sizeof(buf)); 4080 if (vp->v_vflag & VV_CACHEDLABEL) 4081 strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf)); 4082 if (vp->v_vflag & VV_VMSIZEVNLOCK) 4083 strlcat(buf, "|VV_VMSIZEVNLOCK", sizeof(buf)); 4084 if (vp->v_vflag & VV_COPYONWRITE) 4085 strlcat(buf, "|VV_COPYONWRITE", sizeof(buf)); 4086 if (vp->v_vflag & VV_SYSTEM) 4087 strlcat(buf, "|VV_SYSTEM", sizeof(buf)); 4088 if (vp->v_vflag & VV_PROCDEP) 4089 strlcat(buf, "|VV_PROCDEP", sizeof(buf)); 4090 if (vp->v_vflag & VV_NOKNOTE) 4091 strlcat(buf, "|VV_NOKNOTE", sizeof(buf)); 4092 if (vp->v_vflag & VV_DELETED) 4093 strlcat(buf, "|VV_DELETED", sizeof(buf)); 4094 if (vp->v_vflag & VV_MD) 4095 strlcat(buf, 
"|VV_MD", sizeof(buf)); 4096 if (vp->v_vflag & VV_FORCEINSMQ) 4097 strlcat(buf, "|VV_FORCEINSMQ", sizeof(buf)); 4098 if (vp->v_vflag & VV_READLINK) 4099 strlcat(buf, "|VV_READLINK", sizeof(buf)); 4100 flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_ETERNALDEV | 4101 VV_CACHEDLABEL | VV_VMSIZEVNLOCK | VV_COPYONWRITE | VV_SYSTEM | 4102 VV_PROCDEP | VV_NOKNOTE | VV_DELETED | VV_MD | VV_FORCEINSMQ | 4103 VV_READLINK); 4104 if (flags != 0) { 4105 snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags); 4106 strlcat(buf, buf2, sizeof(buf)); 4107 } 4108 if (vp->v_iflag & VI_TEXT_REF) 4109 strlcat(buf, "|VI_TEXT_REF", sizeof(buf)); 4110 if (vp->v_iflag & VI_MOUNT) 4111 strlcat(buf, "|VI_MOUNT", sizeof(buf)); 4112 if (vp->v_iflag & VI_DOINGINACT) 4113 strlcat(buf, "|VI_DOINGINACT", sizeof(buf)); 4114 if (vp->v_iflag & VI_OWEINACT) 4115 strlcat(buf, "|VI_OWEINACT", sizeof(buf)); 4116 if (vp->v_iflag & VI_DEFINACT) 4117 strlcat(buf, "|VI_DEFINACT", sizeof(buf)); 4118 if (vp->v_iflag & VI_FOPENING) 4119 strlcat(buf, "|VI_FOPENING", sizeof(buf)); 4120 flags = vp->v_iflag & ~(VI_TEXT_REF | VI_MOUNT | VI_DOINGINACT | 4121 VI_OWEINACT | VI_DEFINACT | VI_FOPENING); 4122 if (flags != 0) { 4123 snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags); 4124 strlcat(buf, buf2, sizeof(buf)); 4125 } 4126 if (vp->v_mflag & VMP_LAZYLIST) 4127 strlcat(buf, "|VMP_LAZYLIST", sizeof(buf)); 4128 flags = vp->v_mflag & ~(VMP_LAZYLIST); 4129 if (flags != 0) { 4130 snprintf(buf2, sizeof(buf2), "|VMP(0x%lx)", flags); 4131 strlcat(buf, buf2, sizeof(buf)); 4132 } 4133 printf(" flags (%s)", buf + 1); 4134 if (mtx_owned(VI_MTX(vp))) 4135 printf(" VI_LOCKed"); 4136 printf("\n"); 4137 if (vp->v_object != NULL) 4138 printf(" v_object %p ref %d pages %d " 4139 "cleanbuf %d dirtybuf %d\n", 4140 vp->v_object, vp->v_object->ref_count, 4141 vp->v_object->resident_page_count, 4142 vp->v_bufobj.bo_clean.bv_cnt, 4143 vp->v_bufobj.bo_dirty.bv_cnt); 4144 printf(" "); 4145 lockmgr_printinfo(vp->v_vnlock); 4146 if (vp->v_data != NULL) 4147 VOP_PRINT(vp); 4148 } 4149 4150 #ifdef DDB 4151 /* 4152 * List all of the locked vnodes in the system. 4153 * Called when debugging the kernel. 4154 */ 4155 DB_SHOW_COMMAND(lockedvnods, lockedvnodes) 4156 { 4157 struct mount *mp; 4158 struct vnode *vp; 4159 4160 /* 4161 * Note: because this is DDB, we can't obey the locking semantics 4162 * for these structures, which means we could catch an inconsistent 4163 * state and dereference a nasty pointer. Not much to be done 4164 * about that. 4165 */ 4166 db_printf("Locked vnodes\n"); 4167 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 4168 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 4169 if (vp->v_type != VMARKER && VOP_ISLOCKED(vp)) 4170 vn_printf(vp, "vnode "); 4171 } 4172 } 4173 } 4174 4175 /* 4176 * Show details about the given vnode. 4177 */ 4178 DB_SHOW_COMMAND(vnode, db_show_vnode) 4179 { 4180 struct vnode *vp; 4181 4182 if (!have_addr) 4183 return; 4184 vp = (struct vnode *)addr; 4185 vn_printf(vp, "vnode "); 4186 } 4187 4188 /* 4189 * Show details about the given mount point. 4190 */ 4191 DB_SHOW_COMMAND(mount, db_show_mount) 4192 { 4193 struct mount *mp; 4194 struct vfsopt *opt; 4195 struct statfs *sp; 4196 struct vnode *vp; 4197 char buf[512]; 4198 uint64_t mflags; 4199 u_int flags; 4200 4201 if (!have_addr) { 4202 /* No address given, print short info about all mount points. 
*/ 4203 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 4204 db_printf("%p %s on %s (%s)\n", mp, 4205 mp->mnt_stat.f_mntfromname, 4206 mp->mnt_stat.f_mntonname, 4207 mp->mnt_stat.f_fstypename); 4208 if (db_pager_quit) 4209 break; 4210 } 4211 db_printf("\nMore info: show mount <addr>\n"); 4212 return; 4213 } 4214 4215 mp = (struct mount *)addr; 4216 db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname, 4217 mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename); 4218 4219 buf[0] = '\0'; 4220 mflags = mp->mnt_flag; 4221 #define MNT_FLAG(flag) do { \ 4222 if (mflags & (flag)) { \ 4223 if (buf[0] != '\0') \ 4224 strlcat(buf, ", ", sizeof(buf)); \ 4225 strlcat(buf, (#flag) + 4, sizeof(buf)); \ 4226 mflags &= ~(flag); \ 4227 } \ 4228 } while (0) 4229 MNT_FLAG(MNT_RDONLY); 4230 MNT_FLAG(MNT_SYNCHRONOUS); 4231 MNT_FLAG(MNT_NOEXEC); 4232 MNT_FLAG(MNT_NOSUID); 4233 MNT_FLAG(MNT_NFS4ACLS); 4234 MNT_FLAG(MNT_UNION); 4235 MNT_FLAG(MNT_ASYNC); 4236 MNT_FLAG(MNT_SUIDDIR); 4237 MNT_FLAG(MNT_SOFTDEP); 4238 MNT_FLAG(MNT_NOSYMFOLLOW); 4239 MNT_FLAG(MNT_GJOURNAL); 4240 MNT_FLAG(MNT_MULTILABEL); 4241 MNT_FLAG(MNT_ACLS); 4242 MNT_FLAG(MNT_NOATIME); 4243 MNT_FLAG(MNT_NOCLUSTERR); 4244 MNT_FLAG(MNT_NOCLUSTERW); 4245 MNT_FLAG(MNT_SUJ); 4246 MNT_FLAG(MNT_EXRDONLY); 4247 MNT_FLAG(MNT_EXPORTED); 4248 MNT_FLAG(MNT_DEFEXPORTED); 4249 MNT_FLAG(MNT_EXPORTANON); 4250 MNT_FLAG(MNT_EXKERB); 4251 MNT_FLAG(MNT_EXPUBLIC); 4252 MNT_FLAG(MNT_LOCAL); 4253 MNT_FLAG(MNT_QUOTA); 4254 MNT_FLAG(MNT_ROOTFS); 4255 MNT_FLAG(MNT_USER); 4256 MNT_FLAG(MNT_IGNORE); 4257 MNT_FLAG(MNT_UPDATE); 4258 MNT_FLAG(MNT_DELEXPORT); 4259 MNT_FLAG(MNT_RELOAD); 4260 MNT_FLAG(MNT_FORCE); 4261 MNT_FLAG(MNT_SNAPSHOT); 4262 MNT_FLAG(MNT_BYFSID); 4263 #undef MNT_FLAG 4264 if (mflags != 0) { 4265 if (buf[0] != '\0') 4266 strlcat(buf, ", ", sizeof(buf)); 4267 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), 4268 "0x%016jx", mflags); 4269 } 4270 db_printf(" mnt_flag = %s\n", buf); 4271 4272 buf[0] = '\0'; 4273 flags = mp->mnt_kern_flag; 4274 #define MNT_KERN_FLAG(flag) do { \ 4275 if (flags & (flag)) { \ 4276 if (buf[0] != '\0') \ 4277 strlcat(buf, ", ", sizeof(buf)); \ 4278 strlcat(buf, (#flag) + 5, sizeof(buf)); \ 4279 flags &= ~(flag); \ 4280 } \ 4281 } while (0) 4282 MNT_KERN_FLAG(MNTK_UNMOUNTF); 4283 MNT_KERN_FLAG(MNTK_ASYNC); 4284 MNT_KERN_FLAG(MNTK_SOFTDEP); 4285 MNT_KERN_FLAG(MNTK_DRAINING); 4286 MNT_KERN_FLAG(MNTK_REFEXPIRE); 4287 MNT_KERN_FLAG(MNTK_EXTENDED_SHARED); 4288 MNT_KERN_FLAG(MNTK_SHARED_WRITES); 4289 MNT_KERN_FLAG(MNTK_NO_IOPF); 4290 MNT_KERN_FLAG(MNTK_VGONE_UPPER); 4291 MNT_KERN_FLAG(MNTK_VGONE_WAITER); 4292 MNT_KERN_FLAG(MNTK_LOOKUP_EXCL_DOTDOT); 4293 MNT_KERN_FLAG(MNTK_MARKER); 4294 MNT_KERN_FLAG(MNTK_USES_BCACHE); 4295 MNT_KERN_FLAG(MNTK_FPLOOKUP); 4296 MNT_KERN_FLAG(MNTK_NOASYNC); 4297 MNT_KERN_FLAG(MNTK_UNMOUNT); 4298 MNT_KERN_FLAG(MNTK_MWAIT); 4299 MNT_KERN_FLAG(MNTK_SUSPEND); 4300 MNT_KERN_FLAG(MNTK_SUSPEND2); 4301 MNT_KERN_FLAG(MNTK_SUSPENDED); 4302 MNT_KERN_FLAG(MNTK_LOOKUP_SHARED); 4303 MNT_KERN_FLAG(MNTK_NOKNOTE); 4304 #undef MNT_KERN_FLAG 4305 if (flags != 0) { 4306 if (buf[0] != '\0') 4307 strlcat(buf, ", ", sizeof(buf)); 4308 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), 4309 "0x%08x", flags); 4310 } 4311 db_printf(" mnt_kern_flag = %s\n", buf); 4312 4313 db_printf(" mnt_opt = "); 4314 opt = TAILQ_FIRST(mp->mnt_opt); 4315 if (opt != NULL) { 4316 db_printf("%s", opt->name); 4317 opt = TAILQ_NEXT(opt, link); 4318 while (opt != NULL) { 4319 db_printf(", %s", opt->name); 4320 opt = TAILQ_NEXT(opt, link); 4321 } 4322 
} 4323 db_printf("\n"); 4324 4325 sp = &mp->mnt_stat; 4326 db_printf(" mnt_stat = { version=%u type=%u flags=0x%016jx " 4327 "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju " 4328 "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju " 4329 "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n", 4330 (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags, 4331 (uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize, 4332 (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree, 4333 (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files, 4334 (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites, 4335 (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads, 4336 (uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax, 4337 (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]); 4338 4339 db_printf(" mnt_cred = { uid=%u ruid=%u", 4340 (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid); 4341 if (jailed(mp->mnt_cred)) 4342 db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id); 4343 db_printf(" }\n"); 4344 db_printf(" mnt_ref = %d (with %d in the struct)\n", 4345 vfs_mount_fetch_counter(mp, MNT_COUNT_REF), mp->mnt_ref); 4346 db_printf(" mnt_gen = %d\n", mp->mnt_gen); 4347 db_printf(" mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize); 4348 db_printf(" mnt_lazyvnodelistsize = %d\n", 4349 mp->mnt_lazyvnodelistsize); 4350 db_printf(" mnt_writeopcount = %d (with %d in the struct)\n", 4351 vfs_mount_fetch_counter(mp, MNT_COUNT_WRITEOPCOUNT), mp->mnt_writeopcount); 4352 db_printf(" mnt_maxsymlinklen = %d\n", mp->mnt_maxsymlinklen); 4353 db_printf(" mnt_iosize_max = %d\n", mp->mnt_iosize_max); 4354 db_printf(" mnt_hashseed = %u\n", mp->mnt_hashseed); 4355 db_printf(" mnt_lockref = %d (with %d in the struct)\n", 4356 vfs_mount_fetch_counter(mp, MNT_COUNT_LOCKREF), mp->mnt_lockref); 4357 db_printf(" mnt_secondary_writes = %d\n", mp->mnt_secondary_writes); 4358 db_printf(" mnt_secondary_accwrites = %d\n", 4359 mp->mnt_secondary_accwrites); 4360 db_printf(" mnt_gjprovider = %s\n", 4361 mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL"); 4362 db_printf(" mnt_vfs_ops = %d\n", mp->mnt_vfs_ops); 4363 4364 db_printf("\n\nList of active vnodes\n"); 4365 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 4366 if (vp->v_type != VMARKER && vp->v_holdcnt > 0) { 4367 vn_printf(vp, "vnode "); 4368 if (db_pager_quit) 4369 break; 4370 } 4371 } 4372 db_printf("\n\nList of inactive vnodes\n"); 4373 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 4374 if (vp->v_type != VMARKER && vp->v_holdcnt == 0) { 4375 vn_printf(vp, "vnode "); 4376 if (db_pager_quit) 4377 break; 4378 } 4379 } 4380 } 4381 #endif /* DDB */ 4382 4383 /* 4384 * Fill in a struct xvfsconf based on a struct vfsconf. 4385 */ 4386 static int 4387 vfsconf2x(struct sysctl_req *req, struct vfsconf *vfsp) 4388 { 4389 struct xvfsconf xvfsp; 4390 4391 bzero(&xvfsp, sizeof(xvfsp)); 4392 strcpy(xvfsp.vfc_name, vfsp->vfc_name); 4393 xvfsp.vfc_typenum = vfsp->vfc_typenum; 4394 xvfsp.vfc_refcount = vfsp->vfc_refcount; 4395 xvfsp.vfc_flags = vfsp->vfc_flags; 4396 /* 4397 * These are unused in userland, we keep them 4398 * to not break binary compatibility. 
4399 */ 4400 xvfsp.vfc_vfsops = NULL; 4401 xvfsp.vfc_next = NULL; 4402 return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); 4403 } 4404 4405 #ifdef COMPAT_FREEBSD32 4406 struct xvfsconf32 { 4407 uint32_t vfc_vfsops; 4408 char vfc_name[MFSNAMELEN]; 4409 int32_t vfc_typenum; 4410 int32_t vfc_refcount; 4411 int32_t vfc_flags; 4412 uint32_t vfc_next; 4413 }; 4414 4415 static int 4416 vfsconf2x32(struct sysctl_req *req, struct vfsconf *vfsp) 4417 { 4418 struct xvfsconf32 xvfsp; 4419 4420 bzero(&xvfsp, sizeof(xvfsp)); 4421 strcpy(xvfsp.vfc_name, vfsp->vfc_name); 4422 xvfsp.vfc_typenum = vfsp->vfc_typenum; 4423 xvfsp.vfc_refcount = vfsp->vfc_refcount; 4424 xvfsp.vfc_flags = vfsp->vfc_flags; 4425 return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); 4426 } 4427 #endif 4428 4429 /* 4430 * Top level filesystem related information gathering. 4431 */ 4432 static int 4433 sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS) 4434 { 4435 struct vfsconf *vfsp; 4436 int error; 4437 4438 error = 0; 4439 vfsconf_slock(); 4440 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { 4441 #ifdef COMPAT_FREEBSD32 4442 if (req->flags & SCTL_MASK32) 4443 error = vfsconf2x32(req, vfsp); 4444 else 4445 #endif 4446 error = vfsconf2x(req, vfsp); 4447 if (error) 4448 break; 4449 } 4450 vfsconf_sunlock(); 4451 return (error); 4452 } 4453 4454 SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD | 4455 CTLFLAG_MPSAFE, NULL, 0, sysctl_vfs_conflist, 4456 "S,xvfsconf", "List of all configured filesystems"); 4457 4458 #ifndef BURN_BRIDGES 4459 static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS); 4460 4461 static int 4462 vfs_sysctl(SYSCTL_HANDLER_ARGS) 4463 { 4464 int *name = (int *)arg1 - 1; /* XXX */ 4465 u_int namelen = arg2 + 1; /* XXX */ 4466 struct vfsconf *vfsp; 4467 4468 log(LOG_WARNING, "userland calling deprecated sysctl, " 4469 "please rebuild world\n"); 4470 4471 #if 1 || defined(COMPAT_PRELITE2) 4472 /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. 
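	 * A single-component name can only be the old-style VFS_VFSCONF
	 * request, which is handed off to sysctl_ovfs_conf() below.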
*/ 4473 if (namelen == 1) 4474 return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); 4475 #endif 4476 4477 switch (name[1]) { 4478 case VFS_MAXTYPENUM: 4479 if (namelen != 2) 4480 return (ENOTDIR); 4481 return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); 4482 case VFS_CONF: 4483 if (namelen != 3) 4484 return (ENOTDIR); /* overloaded */ 4485 vfsconf_slock(); 4486 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { 4487 if (vfsp->vfc_typenum == name[2]) 4488 break; 4489 } 4490 vfsconf_sunlock(); 4491 if (vfsp == NULL) 4492 return (EOPNOTSUPP); 4493 #ifdef COMPAT_FREEBSD32 4494 if (req->flags & SCTL_MASK32) 4495 return (vfsconf2x32(req, vfsp)); 4496 else 4497 #endif 4498 return (vfsconf2x(req, vfsp)); 4499 } 4500 return (EOPNOTSUPP); 4501 } 4502 4503 static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP | 4504 CTLFLAG_MPSAFE, vfs_sysctl, 4505 "Generic filesystem"); 4506 4507 #if 1 || defined(COMPAT_PRELITE2) 4508 4509 static int 4510 sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS) 4511 { 4512 int error; 4513 struct vfsconf *vfsp; 4514 struct ovfsconf ovfs; 4515 4516 vfsconf_slock(); 4517 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { 4518 bzero(&ovfs, sizeof(ovfs)); 4519 ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ 4520 strcpy(ovfs.vfc_name, vfsp->vfc_name); 4521 ovfs.vfc_index = vfsp->vfc_typenum; 4522 ovfs.vfc_refcount = vfsp->vfc_refcount; 4523 ovfs.vfc_flags = vfsp->vfc_flags; 4524 error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); 4525 if (error != 0) { 4526 vfsconf_sunlock(); 4527 return (error); 4528 } 4529 } 4530 vfsconf_sunlock(); 4531 return (0); 4532 } 4533 4534 #endif /* 1 || COMPAT_PRELITE2 */ 4535 #endif /* !BURN_BRIDGES */ 4536 4537 #define KINFO_VNODESLOP 10 4538 #ifdef notyet 4539 /* 4540 * Dump vnode list (via sysctl). 4541 */ 4542 /* ARGSUSED */ 4543 static int 4544 sysctl_vnode(SYSCTL_HANDLER_ARGS) 4545 { 4546 struct xvnode *xvn; 4547 struct mount *mp; 4548 struct vnode *vp; 4549 int error, len, n; 4550 4551 /* 4552 * Stale numvnodes access is not fatal here. 4553 */ 4554 req->lock = 0; 4555 len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn; 4556 if (!req->oldptr) 4557 /* Make an estimate */ 4558 return (SYSCTL_OUT(req, 0, len)); 4559 4560 error = sysctl_wire_old_buffer(req, 0); 4561 if (error != 0) 4562 return (error); 4563 xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK); 4564 n = 0; 4565 mtx_lock(&mountlist_mtx); 4566 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 4567 if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) 4568 continue; 4569 MNT_ILOCK(mp); 4570 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 4571 if (n == len) 4572 break; 4573 vref(vp); 4574 xvn[n].xv_size = sizeof *xvn; 4575 xvn[n].xv_vnode = vp; 4576 xvn[n].xv_id = 0; /* XXX compat */ 4577 #define XV_COPY(field) xvn[n].xv_##field = vp->v_##field 4578 XV_COPY(usecount); 4579 XV_COPY(writecount); 4580 XV_COPY(holdcnt); 4581 XV_COPY(mount); 4582 XV_COPY(numoutput); 4583 XV_COPY(type); 4584 #undef XV_COPY 4585 xvn[n].xv_flag = vp->v_vflag; 4586 4587 switch (vp->v_type) { 4588 case VREG: 4589 case VDIR: 4590 case VLNK: 4591 break; 4592 case VBLK: 4593 case VCHR: 4594 if (vp->v_rdev == NULL) { 4595 vrele(vp); 4596 continue; 4597 } 4598 xvn[n].xv_dev = dev2udev(vp->v_rdev); 4599 break; 4600 case VSOCK: 4601 xvn[n].xv_socket = vp->v_socket; 4602 break; 4603 case VFIFO: 4604 xvn[n].xv_fifo = vp->v_fifoinfo; 4605 break; 4606 case VNON: 4607 case VBAD: 4608 default: 4609 /* shouldn't happen? 
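			 * A VNON or VBAD vnode carries no type-specific
			 * data worth exporting, so drop the reference and
			 * skip the entry.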
*/ 4610 vrele(vp); 4611 continue; 4612 } 4613 vrele(vp); 4614 ++n; 4615 } 4616 MNT_IUNLOCK(mp); 4617 mtx_lock(&mountlist_mtx); 4618 vfs_unbusy(mp); 4619 if (n == len) 4620 break; 4621 } 4622 mtx_unlock(&mountlist_mtx); 4623 4624 error = SYSCTL_OUT(req, xvn, n * sizeof *xvn); 4625 free(xvn, M_TEMP); 4626 return (error); 4627 } 4628 4629 SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE | CTLFLAG_RD | 4630 CTLFLAG_MPSAFE, 0, 0, sysctl_vnode, "S,xvnode", 4631 ""); 4632 #endif 4633 4634 static void 4635 unmount_or_warn(struct mount *mp) 4636 { 4637 int error; 4638 4639 error = dounmount(mp, MNT_FORCE, curthread); 4640 if (error != 0) { 4641 printf("unmount of %s failed (", mp->mnt_stat.f_mntonname); 4642 if (error == EBUSY) 4643 printf("BUSY)\n"); 4644 else 4645 printf("%d)\n", error); 4646 } 4647 } 4648 4649 /* 4650 * Unmount all filesystems. The list is traversed in reverse order 4651 * of mounting to avoid dependencies. 4652 */ 4653 void 4654 vfs_unmountall(void) 4655 { 4656 struct mount *mp, *tmp; 4657 4658 CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__); 4659 4660 /* 4661 * Since this only runs when rebooting, it is not interlocked. 4662 */ 4663 TAILQ_FOREACH_REVERSE_SAFE(mp, &mountlist, mntlist, mnt_list, tmp) { 4664 vfs_ref(mp); 4665 4666 /* 4667 * Forcibly unmounting "/dev" before "/" would prevent clean 4668 * unmount of the latter. 4669 */ 4670 if (mp == rootdevmp) 4671 continue; 4672 4673 unmount_or_warn(mp); 4674 } 4675 4676 if (rootdevmp != NULL) 4677 unmount_or_warn(rootdevmp); 4678 } 4679 4680 static void 4681 vfs_deferred_inactive(struct vnode *vp, int lkflags) 4682 { 4683 4684 ASSERT_VI_LOCKED(vp, __func__); 4685 VNASSERT((vp->v_iflag & VI_DEFINACT) == 0, vp, ("VI_DEFINACT still set")); 4686 if ((vp->v_iflag & VI_OWEINACT) == 0) { 4687 vdropl(vp); 4688 return; 4689 } 4690 if (vn_lock(vp, lkflags) == 0) { 4691 VI_LOCK(vp); 4692 vinactive(vp); 4693 VOP_UNLOCK(vp); 4694 vdropl(vp); 4695 return; 4696 } 4697 vdefer_inactive_unlocked(vp); 4698 } 4699 4700 static int 4701 vfs_periodic_inactive_filter(struct vnode *vp, void *arg) 4702 { 4703 4704 return (vp->v_iflag & VI_DEFINACT); 4705 } 4706 4707 static void __noinline 4708 vfs_periodic_inactive(struct mount *mp, int flags) 4709 { 4710 struct vnode *vp, *mvp; 4711 int lkflags; 4712 4713 lkflags = LK_EXCLUSIVE | LK_INTERLOCK; 4714 if (flags != MNT_WAIT) 4715 lkflags |= LK_NOWAIT; 4716 4717 MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, vfs_periodic_inactive_filter, NULL) { 4718 if ((vp->v_iflag & VI_DEFINACT) == 0) { 4719 VI_UNLOCK(vp); 4720 continue; 4721 } 4722 vp->v_iflag &= ~VI_DEFINACT; 4723 vfs_deferred_inactive(vp, lkflags); 4724 } 4725 } 4726 4727 static inline bool 4728 vfs_want_msync(struct vnode *vp) 4729 { 4730 struct vm_object *obj; 4731 4732 /* 4733 * This test may be performed without any locks held. 4734 * We rely on vm_object's type stability. 
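	 * (Type stability means the memory backing a vm_object is never
	 * returned to the system, so a racing dereference cannot fault.
	 * The answer is only a hint; everything is re-checked with the
	 * vnode held and locked before any pages are cleaned.)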
4735 */ 4736 if (vp->v_vflag & VV_NOSYNC) 4737 return (false); 4738 obj = vp->v_object; 4739 return (obj != NULL && vm_object_mightbedirty(obj)); 4740 } 4741 4742 static int 4743 vfs_periodic_msync_inactive_filter(struct vnode *vp, void *arg __unused) 4744 { 4745 4746 if (vp->v_vflag & VV_NOSYNC) 4747 return (false); 4748 if (vp->v_iflag & VI_DEFINACT) 4749 return (true); 4750 return (vfs_want_msync(vp)); 4751 } 4752 4753 static void __noinline 4754 vfs_periodic_msync_inactive(struct mount *mp, int flags) 4755 { 4756 struct vnode *vp, *mvp; 4757 struct vm_object *obj; 4758 int lkflags, objflags; 4759 bool seen_defer; 4760 4761 lkflags = LK_EXCLUSIVE | LK_INTERLOCK; 4762 if (flags != MNT_WAIT) { 4763 lkflags |= LK_NOWAIT; 4764 objflags = OBJPC_NOSYNC; 4765 } else { 4766 objflags = OBJPC_SYNC; 4767 } 4768 4769 MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, vfs_periodic_msync_inactive_filter, NULL) { 4770 seen_defer = false; 4771 if (vp->v_iflag & VI_DEFINACT) { 4772 vp->v_iflag &= ~VI_DEFINACT; 4773 seen_defer = true; 4774 } 4775 if (!vfs_want_msync(vp)) { 4776 if (seen_defer) 4777 vfs_deferred_inactive(vp, lkflags); 4778 else 4779 VI_UNLOCK(vp); 4780 continue; 4781 } 4782 if (vget(vp, lkflags) == 0) { 4783 obj = vp->v_object; 4784 if (obj != NULL && (vp->v_vflag & VV_NOSYNC) == 0) { 4785 VM_OBJECT_WLOCK(obj); 4786 vm_object_page_clean(obj, 0, 0, objflags); 4787 VM_OBJECT_WUNLOCK(obj); 4788 } 4789 vput(vp); 4790 if (seen_defer) 4791 vdrop(vp); 4792 } else { 4793 if (seen_defer) 4794 vdefer_inactive_unlocked(vp); 4795 } 4796 } 4797 } 4798 4799 void 4800 vfs_periodic(struct mount *mp, int flags) 4801 { 4802 4803 CTR2(KTR_VFS, "%s: mp %p", __func__, mp); 4804 4805 if ((mp->mnt_kern_flag & MNTK_NOMSYNC) != 0) 4806 vfs_periodic_inactive(mp, flags); 4807 else 4808 vfs_periodic_msync_inactive(mp, flags); 4809 } 4810 4811 static void 4812 destroy_vpollinfo_free(struct vpollinfo *vi) 4813 { 4814 4815 knlist_destroy(&vi->vpi_selinfo.si_note); 4816 mtx_destroy(&vi->vpi_lock); 4817 free(vi, M_VNODEPOLL); 4818 } 4819 4820 static void 4821 destroy_vpollinfo(struct vpollinfo *vi) 4822 { 4823 4824 knlist_clear(&vi->vpi_selinfo.si_note, 1); 4825 seldrain(&vi->vpi_selinfo); 4826 destroy_vpollinfo_free(vi); 4827 } 4828 4829 /* 4830 * Initialize per-vnode helper structure to hold poll-related state. 4831 */ 4832 void 4833 v_addpollinfo(struct vnode *vp) 4834 { 4835 struct vpollinfo *vi; 4836 4837 if (vp->v_pollinfo != NULL) 4838 return; 4839 vi = malloc(sizeof(*vi), M_VNODEPOLL, M_WAITOK | M_ZERO); 4840 mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF); 4841 knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock, 4842 vfs_knlunlock, vfs_knl_assert_lock); 4843 VI_LOCK(vp); 4844 if (vp->v_pollinfo != NULL) { 4845 VI_UNLOCK(vp); 4846 destroy_vpollinfo_free(vi); 4847 return; 4848 } 4849 vp->v_pollinfo = vi; 4850 VI_UNLOCK(vp); 4851 } 4852 4853 /* 4854 * Record a process's interest in events which might happen to 4855 * a vnode. Because poll uses the historic select-style interface 4856 * internally, this routine serves as both the ``check for any 4857 * pending events'' and the ``record my interest in future events'' 4858 * functions. (These are done together, while the lock is held, 4859 * to avoid race conditions.) 
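 *
 * As a rough sketch (hypothetical filesystem "xxfs", not taken from this
 * file), a VOP_POLL implementation that supports notification would fall
 * back on this routine roughly as follows:
 *
 *	static int
 *	xxfs_poll(struct vop_poll_args *ap)
 *	{
 *		return (vn_pollrecord(ap->a_vp, ap->a_td, ap->a_events));
 *	}
 *
 * A non-zero return reports revents that were already pending; zero
 * means the interest was recorded on the vnode's selinfo for a later
 * wakeup.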
4860 */ 4861 int 4862 vn_pollrecord(struct vnode *vp, struct thread *td, int events) 4863 { 4864 4865 v_addpollinfo(vp); 4866 mtx_lock(&vp->v_pollinfo->vpi_lock); 4867 if (vp->v_pollinfo->vpi_revents & events) { 4868 /* 4869 * This leaves events we are not interested 4870 * in available for the other process which 4871 * presumably had requested them 4872 * (otherwise they would never have been 4873 * recorded). 4874 */ 4875 events &= vp->v_pollinfo->vpi_revents; 4876 vp->v_pollinfo->vpi_revents &= ~events; 4877 4878 mtx_unlock(&vp->v_pollinfo->vpi_lock); 4879 return (events); 4880 } 4881 vp->v_pollinfo->vpi_events |= events; 4882 selrecord(td, &vp->v_pollinfo->vpi_selinfo); 4883 mtx_unlock(&vp->v_pollinfo->vpi_lock); 4884 return (0); 4885 } 4886 4887 /* 4888 * Routine to create and manage a filesystem syncer vnode. 4889 */ 4890 #define sync_close ((int (*)(struct vop_close_args *))nullop) 4891 static int sync_fsync(struct vop_fsync_args *); 4892 static int sync_inactive(struct vop_inactive_args *); 4893 static int sync_reclaim(struct vop_reclaim_args *); 4894 4895 static struct vop_vector sync_vnodeops = { 4896 .vop_bypass = VOP_EOPNOTSUPP, 4897 .vop_close = sync_close, /* close */ 4898 .vop_fsync = sync_fsync, /* fsync */ 4899 .vop_inactive = sync_inactive, /* inactive */ 4900 .vop_need_inactive = vop_stdneed_inactive, /* need_inactive */ 4901 .vop_reclaim = sync_reclaim, /* reclaim */ 4902 .vop_lock1 = vop_stdlock, /* lock */ 4903 .vop_unlock = vop_stdunlock, /* unlock */ 4904 .vop_islocked = vop_stdislocked, /* islocked */ 4905 }; 4906 VFS_VOP_VECTOR_REGISTER(sync_vnodeops); 4907 4908 /* 4909 * Create a new filesystem syncer vnode for the specified mount point. 4910 */ 4911 void 4912 vfs_allocate_syncvnode(struct mount *mp) 4913 { 4914 struct vnode *vp; 4915 struct bufobj *bo; 4916 static long start, incr, next; 4917 int error; 4918 4919 /* Allocate a new vnode */ 4920 error = getnewvnode("syncer", mp, &sync_vnodeops, &vp); 4921 if (error != 0) 4922 panic("vfs_allocate_syncvnode: getnewvnode() failed"); 4923 vp->v_type = VNON; 4924 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 4925 vp->v_vflag |= VV_FORCEINSMQ; 4926 error = insmntque(vp, mp); 4927 if (error != 0) 4928 panic("vfs_allocate_syncvnode: insmntque() failed"); 4929 vp->v_vflag &= ~VV_FORCEINSMQ; 4930 VOP_UNLOCK(vp); 4931 /* 4932 * Place the vnode onto the syncer worklist. We attempt to 4933 * scatter them about on the list so that they will go off 4934 * at evenly distributed times even if all the filesystems 4935 * are mounted at once. 4936 */ 4937 next += incr; 4938 if (next == 0 || next > syncer_maxdelay) { 4939 start /= 2; 4940 incr /= 2; 4941 if (start == 0) { 4942 start = syncer_maxdelay / 2; 4943 incr = syncer_maxdelay; 4944 } 4945 next = start; 4946 } 4947 bo = &vp->v_bufobj; 4948 BO_LOCK(bo); 4949 vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0); 4950 /* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx.
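	 *
	 * To illustrate the scattering above: assuming the default
	 * syncer_maxdelay of 32, successive calls hand out next values of
	 * 16, 8, 24, 4, 12, 20, 28, 2, 6, ..., i.e. a progressively finer
	 * subdivision of the syncer wheel.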
*/ 4951 mtx_lock(&sync_mtx); 4952 sync_vnode_count++; 4953 if (mp->mnt_syncer == NULL) { 4954 mp->mnt_syncer = vp; 4955 vp = NULL; 4956 } 4957 mtx_unlock(&sync_mtx); 4958 BO_UNLOCK(bo); 4959 if (vp != NULL) { 4960 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 4961 vgone(vp); 4962 vput(vp); 4963 } 4964 } 4965 4966 void 4967 vfs_deallocate_syncvnode(struct mount *mp) 4968 { 4969 struct vnode *vp; 4970 4971 mtx_lock(&sync_mtx); 4972 vp = mp->mnt_syncer; 4973 if (vp != NULL) 4974 mp->mnt_syncer = NULL; 4975 mtx_unlock(&sync_mtx); 4976 if (vp != NULL) 4977 vrele(vp); 4978 } 4979 4980 /* 4981 * Do a lazy sync of the filesystem. 4982 */ 4983 static int 4984 sync_fsync(struct vop_fsync_args *ap) 4985 { 4986 struct vnode *syncvp = ap->a_vp; 4987 struct mount *mp = syncvp->v_mount; 4988 int error, save; 4989 struct bufobj *bo; 4990 4991 /* 4992 * We only need to do something if this is a lazy evaluation. 4993 */ 4994 if (ap->a_waitfor != MNT_LAZY) 4995 return (0); 4996 4997 /* 4998 * Move ourselves to the back of the sync list. 4999 */ 5000 bo = &syncvp->v_bufobj; 5001 BO_LOCK(bo); 5002 vn_syncer_add_to_worklist(bo, syncdelay); 5003 BO_UNLOCK(bo); 5004 5005 /* 5006 * Walk the list of vnodes pushing all that are dirty and 5007 * not already on the sync list. 5008 */ 5009 if (vfs_busy(mp, MBF_NOWAIT) != 0) 5010 return (0); 5011 if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) { 5012 vfs_unbusy(mp); 5013 return (0); 5014 } 5015 save = curthread_pflags_set(TDP_SYNCIO); 5016 /* 5017 * The filesystem at hand may be idle with free vnodes stored in the 5018 * batch. Return them instead of letting them stay there indefinitely. 5019 */ 5020 vfs_periodic(mp, MNT_NOWAIT); 5021 error = VFS_SYNC(mp, MNT_LAZY); 5022 curthread_pflags_restore(save); 5023 vn_finished_write(mp); 5024 vfs_unbusy(mp); 5025 return (error); 5026 } 5027 5028 /* 5029 * The syncer vnode is no longer referenced. 5030 */ 5031 static int 5032 sync_inactive(struct vop_inactive_args *ap) 5033 { 5034 5035 vgone(ap->a_vp); 5036 return (0); 5037 } 5038 5039 /* 5040 * The syncer vnode is no longer needed and is being decommissioned. 5041 * 5042 * Modifications to the worklist must be protected by sync_mtx.
5043 */ 5044 static int 5045 sync_reclaim(struct vop_reclaim_args *ap) 5046 { 5047 struct vnode *vp = ap->a_vp; 5048 struct bufobj *bo; 5049 5050 bo = &vp->v_bufobj; 5051 BO_LOCK(bo); 5052 mtx_lock(&sync_mtx); 5053 if (vp->v_mount->mnt_syncer == vp) 5054 vp->v_mount->mnt_syncer = NULL; 5055 if (bo->bo_flag & BO_ONWORKLST) { 5056 LIST_REMOVE(bo, bo_synclist); 5057 syncer_worklist_len--; 5058 sync_vnode_count--; 5059 bo->bo_flag &= ~BO_ONWORKLST; 5060 } 5061 mtx_unlock(&sync_mtx); 5062 BO_UNLOCK(bo); 5063 5064 return (0); 5065 } 5066 5067 int 5068 vn_need_pageq_flush(struct vnode *vp) 5069 { 5070 struct vm_object *obj; 5071 int need; 5072 5073 MPASS(mtx_owned(VI_MTX(vp))); 5074 need = 0; 5075 if ((obj = vp->v_object) != NULL && (vp->v_vflag & VV_NOSYNC) == 0 && 5076 vm_object_mightbedirty(obj)) 5077 need = 1; 5078 return (need); 5079 } 5080 5081 /* 5082 * Check whether the vnode represents a disk device. 5083 */ 5084 bool 5085 vn_isdisk_error(struct vnode *vp, int *errp) 5086 { 5087 int error; 5088 5089 if (vp->v_type != VCHR) { 5090 error = ENOTBLK; 5091 goto out; 5092 } 5093 error = 0; 5094 dev_lock(); 5095 if (vp->v_rdev == NULL) 5096 error = ENXIO; 5097 else if (vp->v_rdev->si_devsw == NULL) 5098 error = ENXIO; 5099 else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK)) 5100 error = ENOTBLK; 5101 dev_unlock(); 5102 out: 5103 *errp = error; 5104 return (error == 0); 5105 } 5106 5107 bool 5108 vn_isdisk(struct vnode *vp) 5109 { 5110 int error; 5111 5112 return (vn_isdisk_error(vp, &error)); 5113 } 5114 5115 /* 5116 * VOP_FPLOOKUP_VEXEC routines are subject to special circumstances, see 5117 * the comment above cache_fplookup for details. 5118 */ 5119 int 5120 vaccess_vexec_smr(mode_t file_mode, uid_t file_uid, gid_t file_gid, struct ucred *cred) 5121 { 5122 int error; 5123 5124 VFS_SMR_ASSERT_ENTERED(); 5125 5126 /* Check the owner. */ 5127 if (cred->cr_uid == file_uid) { 5128 if (file_mode & S_IXUSR) 5129 return (0); 5130 goto out_error; 5131 } 5132 5133 /* Otherwise, check the groups (first match) */ 5134 if (groupmember(file_gid, cred)) { 5135 if (file_mode & S_IXGRP) 5136 return (0); 5137 goto out_error; 5138 } 5139 5140 /* Otherwise, check everyone else. */ 5141 if (file_mode & S_IXOTH) 5142 return (0); 5143 out_error: 5144 /* 5145 * Permission check failed, but it is possible the denial will be 5146 * overridden (e.g., when root is traversing through a 700 directory 5147 * owned by someone else). 5148 * 5149 * vaccess() calls priv_check_cred(), which in turn can descend into MAC 5150 * modules that may override this result. It is unclear what semantics 5151 * they are allowed to operate under, so for safety we do not call them 5152 * from within the SMR section. This also means that if any such modules 5153 * are present, we have to let the regular lookup decide. 5154 */ 5155 error = priv_check_cred_vfs_lookup_nomac(cred); 5156 switch (error) { 5157 case 0: 5158 return (0); 5159 case EAGAIN: 5160 /* 5161 * MAC modules present. 5162 */ 5163 return (EAGAIN); 5164 case EPERM: 5165 return (EACCES); 5166 default: 5167 return (error); 5168 } 5169 } 5170 5171 /* 5172 * Common filesystem object access control check routine. Accepts a 5173 * vnode's type, "mode", uid and gid, requested access mode, and credentials. 5174 * Returns 0 on success, or an errno on failure.
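 *
 * A minimal sketch of how a filesystem VOP_ACCESS implementation would
 * typically feed its inode fields into this routine (the "xxfs" names
 * are hypothetical, not from this file):
 *
 *	static int
 *	xxfs_access(struct vop_access_args *ap)
 *	{
 *		struct xxnode *xp = VTOXX(ap->a_vp);
 *
 *		return (vaccess(ap->a_vp->v_type, xp->xx_mode, xp->xx_uid,
 *		    xp->xx_gid, ap->a_accmode, ap->a_cred));
 *	}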
5175 */ 5176 int 5177 vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid, 5178 accmode_t accmode, struct ucred *cred) 5179 { 5180 accmode_t dac_granted; 5181 accmode_t priv_granted; 5182 5183 KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0, 5184 ("invalid bit in accmode")); 5185 KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE), 5186 ("VAPPEND without VWRITE")); 5187 5188 /* 5189 * Look for a normal, non-privileged way to access the file/directory 5190 * as requested. If it exists, go with that. 5191 */ 5192 5193 dac_granted = 0; 5194 5195 /* Check the owner. */ 5196 if (cred->cr_uid == file_uid) { 5197 dac_granted |= VADMIN; 5198 if (file_mode & S_IXUSR) 5199 dac_granted |= VEXEC; 5200 if (file_mode & S_IRUSR) 5201 dac_granted |= VREAD; 5202 if (file_mode & S_IWUSR) 5203 dac_granted |= (VWRITE | VAPPEND); 5204 5205 if ((accmode & dac_granted) == accmode) 5206 return (0); 5207 5208 goto privcheck; 5209 } 5210 5211 /* Otherwise, check the groups (first match) */ 5212 if (groupmember(file_gid, cred)) { 5213 if (file_mode & S_IXGRP) 5214 dac_granted |= VEXEC; 5215 if (file_mode & S_IRGRP) 5216 dac_granted |= VREAD; 5217 if (file_mode & S_IWGRP) 5218 dac_granted |= (VWRITE | VAPPEND); 5219 5220 if ((accmode & dac_granted) == accmode) 5221 return (0); 5222 5223 goto privcheck; 5224 } 5225 5226 /* Otherwise, check everyone else. */ 5227 if (file_mode & S_IXOTH) 5228 dac_granted |= VEXEC; 5229 if (file_mode & S_IROTH) 5230 dac_granted |= VREAD; 5231 if (file_mode & S_IWOTH) 5232 dac_granted |= (VWRITE | VAPPEND); 5233 if ((accmode & dac_granted) == accmode) 5234 return (0); 5235 5236 privcheck: 5237 /* 5238 * Build a privilege mask to determine if the set of privileges 5239 * satisfies the requirements when combined with the granted mask 5240 * from above. For each privilege, if the privilege is required, 5241 * bitwise or the request type onto the priv_granted mask. 5242 */ 5243 priv_granted = 0; 5244 5245 if (type == VDIR) { 5246 /* 5247 * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC 5248 * requests, instead of PRIV_VFS_EXEC. 5249 */ 5250 if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && 5251 !priv_check_cred(cred, PRIV_VFS_LOOKUP)) 5252 priv_granted |= VEXEC; 5253 } else { 5254 /* 5255 * Ensure that at least one execute bit is on. Otherwise, 5256 * a privileged user will always succeed, and we don't want 5257 * this to happen unless the file really is executable. 5258 */ 5259 if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && 5260 (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 && 5261 !priv_check_cred(cred, PRIV_VFS_EXEC)) 5262 priv_granted |= VEXEC; 5263 } 5264 5265 if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) && 5266 !priv_check_cred(cred, PRIV_VFS_READ)) 5267 priv_granted |= VREAD; 5268 5269 if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) && 5270 !priv_check_cred(cred, PRIV_VFS_WRITE)) 5271 priv_granted |= (VWRITE | VAPPEND); 5272 5273 if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) && 5274 !priv_check_cred(cred, PRIV_VFS_ADMIN)) 5275 priv_granted |= VADMIN; 5276 5277 if ((accmode & (priv_granted | dac_granted)) == accmode) { 5278 return (0); 5279 } 5280 5281 return ((accmode & VADMIN) ? EPERM : EACCES); 5282 } 5283 5284 /* 5285 * Credential check based on process requesting service, and per-attribute 5286 * permissions. 
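 *
 * A filesystem implementing extended attributes would typically call this
 * before touching an attribute, e.g. (sketch; VREAD for retrieval-style
 * operations, VWRITE for modifications):
 *
 *	error = extattr_check_cred(vp, ap->a_attrnamespace, ap->a_cred,
 *	    ap->a_td, VREAD);
 *	if (error != 0)
 *		return (error);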
5287 */ 5288 int 5289 extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred, 5290 struct thread *td, accmode_t accmode) 5291 { 5292 5293 /* 5294 * Kernel-invoked always succeeds. 5295 */ 5296 if (cred == NOCRED) 5297 return (0); 5298 5299 /* 5300 * Do not allow privileged processes in jail to directly manipulate 5301 * system attributes. 5302 */ 5303 switch (attrnamespace) { 5304 case EXTATTR_NAMESPACE_SYSTEM: 5305 /* Potentially should be: return (EPERM); */ 5306 return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM)); 5307 case EXTATTR_NAMESPACE_USER: 5308 return (VOP_ACCESS(vp, accmode, cred, td)); 5309 default: 5310 return (EPERM); 5311 } 5312 } 5313 5314 #ifdef DEBUG_VFS_LOCKS 5315 /* 5316 * This only exists to suppress warnings from unlocked specfs accesses. It is 5317 * no longer ok to have an unlocked VFS. 5318 */ 5319 #define IGNORE_LOCK(vp) (KERNEL_PANICKED() || (vp) == NULL || \ 5320 (vp)->v_type == VCHR || (vp)->v_type == VBAD) 5321 5322 int vfs_badlock_ddb = 1; /* Drop into debugger on violation. */ 5323 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0, 5324 "Drop into debugger on lock violation"); 5325 5326 int vfs_badlock_mutex = 1; /* Check for interlock across VOPs. */ 5327 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex, 5328 0, "Check for interlock across VOPs"); 5329 5330 int vfs_badlock_print = 1; /* Print lock violations. */ 5331 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print, 5332 0, "Print lock violations"); 5333 5334 int vfs_badlock_vnode = 1; /* Print vnode details on lock violations. */ 5335 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_vnode, CTLFLAG_RW, &vfs_badlock_vnode, 5336 0, "Print vnode details on lock violations"); 5337 5338 #ifdef KDB 5339 int vfs_badlock_backtrace = 1; /* Print backtrace at lock violations. 
*/ 5340 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW, 5341 &vfs_badlock_backtrace, 0, "Print backtrace at lock violations"); 5342 #endif 5343 5344 static void 5345 vfs_badlock(const char *msg, const char *str, struct vnode *vp) 5346 { 5347 5348 #ifdef KDB 5349 if (vfs_badlock_backtrace) 5350 kdb_backtrace(); 5351 #endif 5352 if (vfs_badlock_vnode) 5353 vn_printf(vp, "vnode "); 5354 if (vfs_badlock_print) 5355 printf("%s: %p %s\n", str, (void *)vp, msg); 5356 if (vfs_badlock_ddb) 5357 kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); 5358 } 5359 5360 void 5361 assert_vi_locked(struct vnode *vp, const char *str) 5362 { 5363 5364 if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp))) 5365 vfs_badlock("interlock is not locked but should be", str, vp); 5366 } 5367 5368 void 5369 assert_vi_unlocked(struct vnode *vp, const char *str) 5370 { 5371 5372 if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp))) 5373 vfs_badlock("interlock is locked but should not be", str, vp); 5374 } 5375 5376 void 5377 assert_vop_locked(struct vnode *vp, const char *str) 5378 { 5379 int locked; 5380 5381 if (!IGNORE_LOCK(vp)) { 5382 locked = VOP_ISLOCKED(vp); 5383 if (locked == 0 || locked == LK_EXCLOTHER) 5384 vfs_badlock("is not locked but should be", str, vp); 5385 } 5386 } 5387 5388 void 5389 assert_vop_unlocked(struct vnode *vp, const char *str) 5390 { 5391 5392 if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) == LK_EXCLUSIVE) 5393 vfs_badlock("is locked but should not be", str, vp); 5394 } 5395 5396 void 5397 assert_vop_elocked(struct vnode *vp, const char *str) 5398 { 5399 5400 if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLUSIVE) 5401 vfs_badlock("is not exclusive locked but should be", str, vp); 5402 } 5403 #endif /* DEBUG_VFS_LOCKS */ 5404 5405 void 5406 vop_rename_fail(struct vop_rename_args *ap) 5407 { 5408 5409 if (ap->a_tvp != NULL) 5410 vput(ap->a_tvp); 5411 if (ap->a_tdvp == ap->a_tvp) 5412 vrele(ap->a_tdvp); 5413 else 5414 vput(ap->a_tdvp); 5415 vrele(ap->a_fdvp); 5416 vrele(ap->a_fvp); 5417 } 5418 5419 void 5420 vop_rename_pre(void *ap) 5421 { 5422 struct vop_rename_args *a = ap; 5423 5424 #ifdef DEBUG_VFS_LOCKS 5425 if (a->a_tvp) 5426 ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME"); 5427 ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME"); 5428 ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME"); 5429 ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME"); 5430 5431 /* Check the source (from). */ 5432 if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock && 5433 (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock)) 5434 ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked"); 5435 if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock) 5436 ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked"); 5437 5438 /* Check the target. */ 5439 if (a->a_tvp) 5440 ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked"); 5441 ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked"); 5442 #endif 5443 /* 5444 * It may be tempting to add vn_seqc_write_begin/end calls here and 5445 * in vop_rename_post but that's not going to work out since some 5446 * filesystems relookup vnodes mid-rename. This is probably a bug. 5447 * 5448 * For now filesystems are expected to do the relevant calls after they 5449 * decide what vnodes to operate on. 
5450 */ 5451 if (a->a_tdvp != a->a_fdvp) 5452 vhold(a->a_fdvp); 5453 if (a->a_tvp != a->a_fvp) 5454 vhold(a->a_fvp); 5455 vhold(a->a_tdvp); 5456 if (a->a_tvp) 5457 vhold(a->a_tvp); 5458 } 5459 5460 #ifdef DEBUG_VFS_LOCKS 5461 void 5462 vop_fplookup_vexec_debugpre(void *ap __unused) 5463 { 5464 5465 VFS_SMR_ASSERT_ENTERED(); 5466 } 5467 5468 void 5469 vop_fplookup_vexec_debugpost(void *ap __unused, int rc __unused) 5470 { 5471 5472 VFS_SMR_ASSERT_ENTERED(); 5473 } 5474 5475 void 5476 vop_fplookup_symlink_debugpre(void *ap __unused) 5477 { 5478 5479 VFS_SMR_ASSERT_ENTERED(); 5480 } 5481 5482 void 5483 vop_fplookup_symlink_debugpost(void *ap __unused, int rc __unused) 5484 { 5485 5486 VFS_SMR_ASSERT_ENTERED(); 5487 } 5488 void 5489 vop_strategy_debugpre(void *ap) 5490 { 5491 struct vop_strategy_args *a; 5492 struct buf *bp; 5493 5494 a = ap; 5495 bp = a->a_bp; 5496 5497 /* 5498 * Cluster ops lock their component buffers but not the IO container. 5499 */ 5500 if ((bp->b_flags & B_CLUSTER) != 0) 5501 return; 5502 5503 if (!KERNEL_PANICKED() && !BUF_ISLOCKED(bp)) { 5504 if (vfs_badlock_print) 5505 printf( 5506 "VOP_STRATEGY: bp is not locked but should be\n"); 5507 if (vfs_badlock_ddb) 5508 kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); 5509 } 5510 } 5511 5512 void 5513 vop_lock_debugpre(void *ap) 5514 { 5515 struct vop_lock1_args *a = ap; 5516 5517 if ((a->a_flags & LK_INTERLOCK) == 0) 5518 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); 5519 else 5520 ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK"); 5521 } 5522 5523 void 5524 vop_lock_debugpost(void *ap, int rc) 5525 { 5526 struct vop_lock1_args *a = ap; 5527 5528 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); 5529 if (rc == 0 && (a->a_flags & LK_EXCLOTHER) == 0) 5530 ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK"); 5531 } 5532 5533 void 5534 vop_unlock_debugpre(void *ap) 5535 { 5536 struct vop_unlock_args *a = ap; 5537 5538 ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK"); 5539 } 5540 5541 void 5542 vop_need_inactive_debugpre(void *ap) 5543 { 5544 struct vop_need_inactive_args *a = ap; 5545 5546 ASSERT_VI_LOCKED(a->a_vp, "VOP_NEED_INACTIVE"); 5547 } 5548 5549 void 5550 vop_need_inactive_debugpost(void *ap, int rc) 5551 { 5552 struct vop_need_inactive_args *a = ap; 5553 5554 ASSERT_VI_LOCKED(a->a_vp, "VOP_NEED_INACTIVE"); 5555 } 5556 #endif 5557 5558 void 5559 vop_create_pre(void *ap) 5560 { 5561 struct vop_create_args *a; 5562 struct vnode *dvp; 5563 5564 a = ap; 5565 dvp = a->a_dvp; 5566 vn_seqc_write_begin(dvp); 5567 } 5568 5569 void 5570 vop_create_post(void *ap, int rc) 5571 { 5572 struct vop_create_args *a; 5573 struct vnode *dvp; 5574 5575 a = ap; 5576 dvp = a->a_dvp; 5577 vn_seqc_write_end(dvp); 5578 if (!rc) 5579 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); 5580 } 5581 5582 void 5583 vop_whiteout_pre(void *ap) 5584 { 5585 struct vop_whiteout_args *a; 5586 struct vnode *dvp; 5587 5588 a = ap; 5589 dvp = a->a_dvp; 5590 vn_seqc_write_begin(dvp); 5591 } 5592 5593 void 5594 vop_whiteout_post(void *ap, int rc) 5595 { 5596 struct vop_whiteout_args *a; 5597 struct vnode *dvp; 5598 5599 a = ap; 5600 dvp = a->a_dvp; 5601 vn_seqc_write_end(dvp); 5602 } 5603 5604 void 5605 vop_deleteextattr_pre(void *ap) 5606 { 5607 struct vop_deleteextattr_args *a; 5608 struct vnode *vp; 5609 5610 a = ap; 5611 vp = a->a_vp; 5612 vn_seqc_write_begin(vp); 5613 } 5614 5615 void 5616 vop_deleteextattr_post(void *ap, int rc) 5617 { 5618 struct vop_deleteextattr_args *a; 5619 struct vnode *vp; 5620 5621 a = ap; 5622 vp = a->a_vp; 5623 vn_seqc_write_end(vp); 5624 if (!rc) 5625 VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); 
5626 } 5627 5628 void 5629 vop_link_pre(void *ap) 5630 { 5631 struct vop_link_args *a; 5632 struct vnode *vp, *tdvp; 5633 5634 a = ap; 5635 vp = a->a_vp; 5636 tdvp = a->a_tdvp; 5637 vn_seqc_write_begin(vp); 5638 vn_seqc_write_begin(tdvp); 5639 } 5640 5641 void 5642 vop_link_post(void *ap, int rc) 5643 { 5644 struct vop_link_args *a; 5645 struct vnode *vp, *tdvp; 5646 5647 a = ap; 5648 vp = a->a_vp; 5649 tdvp = a->a_tdvp; 5650 vn_seqc_write_end(vp); 5651 vn_seqc_write_end(tdvp); 5652 if (!rc) { 5653 VFS_KNOTE_LOCKED(vp, NOTE_LINK); 5654 VFS_KNOTE_LOCKED(tdvp, NOTE_WRITE); 5655 } 5656 } 5657 5658 void 5659 vop_mkdir_pre(void *ap) 5660 { 5661 struct vop_mkdir_args *a; 5662 struct vnode *dvp; 5663 5664 a = ap; 5665 dvp = a->a_dvp; 5666 vn_seqc_write_begin(dvp); 5667 } 5668 5669 void 5670 vop_mkdir_post(void *ap, int rc) 5671 { 5672 struct vop_mkdir_args *a; 5673 struct vnode *dvp; 5674 5675 a = ap; 5676 dvp = a->a_dvp; 5677 vn_seqc_write_end(dvp); 5678 if (!rc) 5679 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK); 5680 } 5681 5682 #ifdef DEBUG_VFS_LOCKS 5683 void 5684 vop_mkdir_debugpost(void *ap, int rc) 5685 { 5686 struct vop_mkdir_args *a; 5687 5688 a = ap; 5689 if (!rc) 5690 cache_validate(a->a_dvp, *a->a_vpp, a->a_cnp); 5691 } 5692 #endif 5693 5694 void 5695 vop_mknod_pre(void *ap) 5696 { 5697 struct vop_mknod_args *a; 5698 struct vnode *dvp; 5699 5700 a = ap; 5701 dvp = a->a_dvp; 5702 vn_seqc_write_begin(dvp); 5703 } 5704 5705 void 5706 vop_mknod_post(void *ap, int rc) 5707 { 5708 struct vop_mknod_args *a; 5709 struct vnode *dvp; 5710 5711 a = ap; 5712 dvp = a->a_dvp; 5713 vn_seqc_write_end(dvp); 5714 if (!rc) 5715 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); 5716 } 5717 5718 void 5719 vop_reclaim_post(void *ap, int rc) 5720 { 5721 struct vop_reclaim_args *a; 5722 struct vnode *vp; 5723 5724 a = ap; 5725 vp = a->a_vp; 5726 ASSERT_VOP_IN_SEQC(vp); 5727 if (!rc) 5728 VFS_KNOTE_LOCKED(vp, NOTE_REVOKE); 5729 } 5730 5731 void 5732 vop_remove_pre(void *ap) 5733 { 5734 struct vop_remove_args *a; 5735 struct vnode *dvp, *vp; 5736 5737 a = ap; 5738 dvp = a->a_dvp; 5739 vp = a->a_vp; 5740 vn_seqc_write_begin(dvp); 5741 vn_seqc_write_begin(vp); 5742 } 5743 5744 void 5745 vop_remove_post(void *ap, int rc) 5746 { 5747 struct vop_remove_args *a; 5748 struct vnode *dvp, *vp; 5749 5750 a = ap; 5751 dvp = a->a_dvp; 5752 vp = a->a_vp; 5753 vn_seqc_write_end(dvp); 5754 vn_seqc_write_end(vp); 5755 if (!rc) { 5756 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); 5757 VFS_KNOTE_LOCKED(vp, NOTE_DELETE); 5758 } 5759 } 5760 5761 void 5762 vop_rename_post(void *ap, int rc) 5763 { 5764 struct vop_rename_args *a = ap; 5765 long hint; 5766 5767 if (!rc) { 5768 hint = NOTE_WRITE; 5769 if (a->a_fdvp == a->a_tdvp) { 5770 if (a->a_tvp != NULL && a->a_tvp->v_type == VDIR) 5771 hint |= NOTE_LINK; 5772 VFS_KNOTE_UNLOCKED(a->a_fdvp, hint); 5773 VFS_KNOTE_UNLOCKED(a->a_tdvp, hint); 5774 } else { 5775 hint |= NOTE_EXTEND; 5776 if (a->a_fvp->v_type == VDIR) 5777 hint |= NOTE_LINK; 5778 VFS_KNOTE_UNLOCKED(a->a_fdvp, hint); 5779 5780 if (a->a_fvp->v_type == VDIR && a->a_tvp != NULL && 5781 a->a_tvp->v_type == VDIR) 5782 hint &= ~NOTE_LINK; 5783 VFS_KNOTE_UNLOCKED(a->a_tdvp, hint); 5784 } 5785 5786 VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME); 5787 if (a->a_tvp) 5788 VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE); 5789 } 5790 if (a->a_tdvp != a->a_fdvp) 5791 vdrop(a->a_fdvp); 5792 if (a->a_tvp != a->a_fvp) 5793 vdrop(a->a_fvp); 5794 vdrop(a->a_tdvp); 5795 if (a->a_tvp) 5796 vdrop(a->a_tvp); 5797 } 5798 5799 void 5800 vop_rmdir_pre(void *ap) 5801 { 5802 struct 
vop_rmdir_args *a; 5803 struct vnode *dvp, *vp; 5804 5805 a = ap; 5806 dvp = a->a_dvp; 5807 vp = a->a_vp; 5808 vn_seqc_write_begin(dvp); 5809 vn_seqc_write_begin(vp); 5810 } 5811 5812 void 5813 vop_rmdir_post(void *ap, int rc) 5814 { 5815 struct vop_rmdir_args *a; 5816 struct vnode *dvp, *vp; 5817 5818 a = ap; 5819 dvp = a->a_dvp; 5820 vp = a->a_vp; 5821 vn_seqc_write_end(dvp); 5822 vn_seqc_write_end(vp); 5823 if (!rc) { 5824 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK); 5825 VFS_KNOTE_LOCKED(vp, NOTE_DELETE); 5826 } 5827 } 5828 5829 void 5830 vop_setattr_pre(void *ap) 5831 { 5832 struct vop_setattr_args *a; 5833 struct vnode *vp; 5834 5835 a = ap; 5836 vp = a->a_vp; 5837 vn_seqc_write_begin(vp); 5838 } 5839 5840 void 5841 vop_setattr_post(void *ap, int rc) 5842 { 5843 struct vop_setattr_args *a; 5844 struct vnode *vp; 5845 5846 a = ap; 5847 vp = a->a_vp; 5848 vn_seqc_write_end(vp); 5849 if (!rc) 5850 VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB); 5851 } 5852 5853 void 5854 vop_setacl_pre(void *ap) 5855 { 5856 struct vop_setacl_args *a; 5857 struct vnode *vp; 5858 5859 a = ap; 5860 vp = a->a_vp; 5861 vn_seqc_write_begin(vp); 5862 } 5863 5864 void 5865 vop_setacl_post(void *ap, int rc __unused) 5866 { 5867 struct vop_setacl_args *a; 5868 struct vnode *vp; 5869 5870 a = ap; 5871 vp = a->a_vp; 5872 vn_seqc_write_end(vp); 5873 } 5874 5875 void 5876 vop_setextattr_pre(void *ap) 5877 { 5878 struct vop_setextattr_args *a; 5879 struct vnode *vp; 5880 5881 a = ap; 5882 vp = a->a_vp; 5883 vn_seqc_write_begin(vp); 5884 } 5885 5886 void 5887 vop_setextattr_post(void *ap, int rc) 5888 { 5889 struct vop_setextattr_args *a; 5890 struct vnode *vp; 5891 5892 a = ap; 5893 vp = a->a_vp; 5894 vn_seqc_write_end(vp); 5895 if (!rc) 5896 VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB); 5897 } 5898 5899 void 5900 vop_symlink_pre(void *ap) 5901 { 5902 struct vop_symlink_args *a; 5903 struct vnode *dvp; 5904 5905 a = ap; 5906 dvp = a->a_dvp; 5907 vn_seqc_write_begin(dvp); 5908 } 5909 5910 void 5911 vop_symlink_post(void *ap, int rc) 5912 { 5913 struct vop_symlink_args *a; 5914 struct vnode *dvp; 5915 5916 a = ap; 5917 dvp = a->a_dvp; 5918 vn_seqc_write_end(dvp); 5919 if (!rc) 5920 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); 5921 } 5922 5923 void 5924 vop_open_post(void *ap, int rc) 5925 { 5926 struct vop_open_args *a = ap; 5927 5928 if (!rc) 5929 VFS_KNOTE_LOCKED(a->a_vp, NOTE_OPEN); 5930 } 5931 5932 void 5933 vop_close_post(void *ap, int rc) 5934 { 5935 struct vop_close_args *a = ap; 5936 5937 if (!rc && (a->a_cred != NOCRED || /* filter out revokes */ 5938 !VN_IS_DOOMED(a->a_vp))) { 5939 VFS_KNOTE_LOCKED(a->a_vp, (a->a_fflag & FWRITE) != 0 ? 5940 NOTE_CLOSE_WRITE : NOTE_CLOSE); 5941 } 5942 } 5943 5944 void 5945 vop_read_post(void *ap, int rc) 5946 { 5947 struct vop_read_args *a = ap; 5948 5949 if (!rc) 5950 VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ); 5951 } 5952 5953 void 5954 vop_read_pgcache_post(void *ap, int rc) 5955 { 5956 struct vop_read_pgcache_args *a = ap; 5957 5958 if (!rc) 5959 VFS_KNOTE_UNLOCKED(a->a_vp, NOTE_READ); 5960 } 5961 5962 void 5963 vop_readdir_post(void *ap, int rc) 5964 { 5965 struct vop_readdir_args *a = ap; 5966 5967 if (!rc) 5968 VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ); 5969 } 5970 5971 static struct knlist fs_knlist; 5972 5973 static void 5974 vfs_event_init(void *arg) 5975 { 5976 knlist_init_mtx(&fs_knlist, NULL); 5977 } 5978 /* XXX - correct order? 
*/ 5979 SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL); 5980 5981 void 5982 vfs_event_signal(fsid_t *fsid, uint32_t event, intptr_t data __unused) 5983 { 5984 5985 KNOTE_UNLOCKED(&fs_knlist, event); 5986 } 5987 5988 static int filt_fsattach(struct knote *kn); 5989 static void filt_fsdetach(struct knote *kn); 5990 static int filt_fsevent(struct knote *kn, long hint); 5991 5992 struct filterops fs_filtops = { 5993 .f_isfd = 0, 5994 .f_attach = filt_fsattach, 5995 .f_detach = filt_fsdetach, 5996 .f_event = filt_fsevent 5997 }; 5998 5999 static int 6000 filt_fsattach(struct knote *kn) 6001 { 6002 6003 kn->kn_flags |= EV_CLEAR; 6004 knlist_add(&fs_knlist, kn, 0); 6005 return (0); 6006 } 6007 6008 static void 6009 filt_fsdetach(struct knote *kn) 6010 { 6011 6012 knlist_remove(&fs_knlist, kn, 0); 6013 } 6014 6015 static int 6016 filt_fsevent(struct knote *kn, long hint) 6017 { 6018 6019 kn->kn_fflags |= kn->kn_sfflags & hint; 6020 6021 return (kn->kn_fflags != 0); 6022 } 6023 6024 static int 6025 sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS) 6026 { 6027 struct vfsidctl vc; 6028 int error; 6029 struct mount *mp; 6030 6031 error = SYSCTL_IN(req, &vc, sizeof(vc)); 6032 if (error) 6033 return (error); 6034 if (vc.vc_vers != VFS_CTL_VERS1) 6035 return (EINVAL); 6036 mp = vfs_getvfs(&vc.vc_fsid); 6037 if (mp == NULL) 6038 return (ENOENT); 6039 /* ensure that a specific sysctl goes to the right filesystem. */ 6040 if (strcmp(vc.vc_fstypename, "*") != 0 && 6041 strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) { 6042 vfs_rel(mp); 6043 return (EINVAL); 6044 } 6045 VCTLTOREQ(&vc, req); 6046 error = VFS_SYSCTL(mp, vc.vc_op, req); 6047 vfs_rel(mp); 6048 return (error); 6049 } 6050 6051 SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE | CTLFLAG_MPSAFE | CTLFLAG_WR, 6052 NULL, 0, sysctl_vfs_ctl, "", 6053 "Sysctl by fsid"); 6054 6055 /* 6056 * Function to initialize a va_filerev field sensibly. 6057 * XXX: Wouldn't a random number make a lot more sense ?? 
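 *
 * The expected use (sketch; "ip" stands for a filesystem's in-memory
 * inode) is to seed the revision once when the inode is first set up and
 * let the filesystem bump it on every modification:
 *
 *	ip->i_modrev = init_va_filerev();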
6058 */ 6059 u_quad_t 6060 init_va_filerev(void) 6061 { 6062 struct bintime bt; 6063 6064 getbinuptime(&bt); 6065 return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL)); 6066 } 6067 6068 static int filt_vfsread(struct knote *kn, long hint); 6069 static int filt_vfswrite(struct knote *kn, long hint); 6070 static int filt_vfsvnode(struct knote *kn, long hint); 6071 static void filt_vfsdetach(struct knote *kn); 6072 static struct filterops vfsread_filtops = { 6073 .f_isfd = 1, 6074 .f_detach = filt_vfsdetach, 6075 .f_event = filt_vfsread 6076 }; 6077 static struct filterops vfswrite_filtops = { 6078 .f_isfd = 1, 6079 .f_detach = filt_vfsdetach, 6080 .f_event = filt_vfswrite 6081 }; 6082 static struct filterops vfsvnode_filtops = { 6083 .f_isfd = 1, 6084 .f_detach = filt_vfsdetach, 6085 .f_event = filt_vfsvnode 6086 }; 6087 6088 static void 6089 vfs_knllock(void *arg) 6090 { 6091 struct vnode *vp = arg; 6092 6093 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 6094 } 6095 6096 static void 6097 vfs_knlunlock(void *arg) 6098 { 6099 struct vnode *vp = arg; 6100 6101 VOP_UNLOCK(vp); 6102 } 6103 6104 static void 6105 vfs_knl_assert_lock(void *arg, int what) 6106 { 6107 #ifdef DEBUG_VFS_LOCKS 6108 struct vnode *vp = arg; 6109 6110 if (what == LA_LOCKED) 6111 ASSERT_VOP_LOCKED(vp, "vfs_knl_assert_locked"); 6112 else 6113 ASSERT_VOP_UNLOCKED(vp, "vfs_knl_assert_unlocked"); 6114 #endif 6115 } 6116 6117 int 6118 vfs_kqfilter(struct vop_kqfilter_args *ap) 6119 { 6120 struct vnode *vp = ap->a_vp; 6121 struct knote *kn = ap->a_kn; 6122 struct knlist *knl; 6123 6124 switch (kn->kn_filter) { 6125 case EVFILT_READ: 6126 kn->kn_fop = &vfsread_filtops; 6127 break; 6128 case EVFILT_WRITE: 6129 kn->kn_fop = &vfswrite_filtops; 6130 break; 6131 case EVFILT_VNODE: 6132 kn->kn_fop = &vfsvnode_filtops; 6133 break; 6134 default: 6135 return (EINVAL); 6136 } 6137 6138 kn->kn_hook = (caddr_t)vp; 6139 6140 v_addpollinfo(vp); 6141 if (vp->v_pollinfo == NULL) 6142 return (ENOMEM); 6143 knl = &vp->v_pollinfo->vpi_selinfo.si_note; 6144 vhold(vp); 6145 knlist_add(knl, kn, 0); 6146 6147 return (0); 6148 } 6149 6150 /* 6151 * Detach knote from vnode 6152 */ 6153 static void 6154 filt_vfsdetach(struct knote *kn) 6155 { 6156 struct vnode *vp = (struct vnode *)kn->kn_hook; 6157 6158 KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo")); 6159 knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0); 6160 vdrop(vp); 6161 } 6162 6163 /*ARGSUSED*/ 6164 static int 6165 filt_vfsread(struct knote *kn, long hint) 6166 { 6167 struct vnode *vp = (struct vnode *)kn->kn_hook; 6168 struct vattr va; 6169 int res; 6170 6171 /* 6172 * filesystem is gone, so set the EOF flag and schedule 6173 * the knote for deletion. 6174 */ 6175 if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) { 6176 VI_LOCK(vp); 6177 kn->kn_flags |= (EV_EOF | EV_ONESHOT); 6178 VI_UNLOCK(vp); 6179 return (1); 6180 } 6181 6182 if (VOP_GETATTR(vp, &va, curthread->td_ucred)) 6183 return (0); 6184 6185 VI_LOCK(vp); 6186 kn->kn_data = va.va_size - kn->kn_fp->f_offset; 6187 res = (kn->kn_sfflags & NOTE_FILE_POLL) != 0 || kn->kn_data != 0; 6188 VI_UNLOCK(vp); 6189 return (res); 6190 } 6191 6192 /*ARGSUSED*/ 6193 static int 6194 filt_vfswrite(struct knote *kn, long hint) 6195 { 6196 struct vnode *vp = (struct vnode *)kn->kn_hook; 6197 6198 VI_LOCK(vp); 6199 6200 /* 6201 * filesystem is gone, so set the EOF flag and schedule 6202 * the knote for deletion. 
6203 */ 6204 if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) 6205 kn->kn_flags |= (EV_EOF | EV_ONESHOT); 6206 6207 kn->kn_data = 0; 6208 VI_UNLOCK(vp); 6209 return (1); 6210 } 6211 6212 static int 6213 filt_vfsvnode(struct knote *kn, long hint) 6214 { 6215 struct vnode *vp = (struct vnode *)kn->kn_hook; 6216 int res; 6217 6218 VI_LOCK(vp); 6219 if (kn->kn_sfflags & hint) 6220 kn->kn_fflags |= hint; 6221 if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) { 6222 kn->kn_flags |= EV_EOF; 6223 VI_UNLOCK(vp); 6224 return (1); 6225 } 6226 res = (kn->kn_fflags != 0); 6227 VI_UNLOCK(vp); 6228 return (res); 6229 } 6230 6231 /* 6232 * Returns whether the directory is empty or not. 6233 * If it is empty, the return value is 0; otherwise 6234 * the return value is an error value (which may 6235 * be ENOTEMPTY). 6236 */ 6237 int 6238 vfs_emptydir(struct vnode *vp) 6239 { 6240 struct uio uio; 6241 struct iovec iov; 6242 struct dirent *dirent, *dp, *endp; 6243 int error, eof; 6244 6245 error = 0; 6246 eof = 0; 6247 6248 ASSERT_VOP_LOCKED(vp, "vfs_emptydir"); 6249 6250 dirent = malloc(sizeof(struct dirent), M_TEMP, M_WAITOK); 6251 iov.iov_base = dirent; 6252 iov.iov_len = sizeof(struct dirent); 6253 6254 uio.uio_iov = &iov; 6255 uio.uio_iovcnt = 1; 6256 uio.uio_offset = 0; 6257 uio.uio_resid = sizeof(struct dirent); 6258 uio.uio_segflg = UIO_SYSSPACE; 6259 uio.uio_rw = UIO_READ; 6260 uio.uio_td = curthread; 6261 6262 while (eof == 0 && error == 0) { 6263 error = VOP_READDIR(vp, &uio, curthread->td_ucred, &eof, 6264 NULL, NULL); 6265 if (error != 0) 6266 break; 6267 endp = (void *)((uint8_t *)dirent + 6268 sizeof(struct dirent) - uio.uio_resid); 6269 for (dp = dirent; dp < endp; 6270 dp = (void *)((uint8_t *)dp + GENERIC_DIRSIZ(dp))) { 6271 if (dp->d_type == DT_WHT) 6272 continue; 6273 if (dp->d_namlen == 0) 6274 continue; 6275 if (dp->d_type != DT_DIR && 6276 dp->d_type != DT_UNKNOWN) { 6277 error = ENOTEMPTY; 6278 break; 6279 } 6280 if (dp->d_namlen > 2) { 6281 error = ENOTEMPTY; 6282 break; 6283 } 6284 if (dp->d_namlen == 1 && 6285 dp->d_name[0] != '.') { 6286 error = ENOTEMPTY; 6287 break; 6288 } 6289 if (dp->d_namlen == 2 && 6290 dp->d_name[1] != '.') { 6291 error = ENOTEMPTY; 6292 break; 6293 } 6294 uio.uio_resid = sizeof(struct dirent); 6295 } 6296 } 6297 free(dirent, M_TEMP); 6298 return (error); 6299 } 6300 6301 int 6302 vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off) 6303 { 6304 int error; 6305 6306 if (dp->d_reclen > ap->a_uio->uio_resid) 6307 return (ENAMETOOLONG); 6308 error = uiomove(dp, dp->d_reclen, ap->a_uio); 6309 if (error) { 6310 if (ap->a_ncookies != NULL) { 6311 if (ap->a_cookies != NULL) 6312 free(ap->a_cookies, M_TEMP); 6313 ap->a_cookies = NULL; 6314 *ap->a_ncookies = 0; 6315 } 6316 return (error); 6317 } 6318 if (ap->a_ncookies == NULL) 6319 return (0); 6320 6321 KASSERT(ap->a_cookies, 6322 ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!")); 6323 6324 *ap->a_cookies = realloc(*ap->a_cookies, 6325 (*ap->a_ncookies + 1) * sizeof(u_long), M_TEMP, M_WAITOK | M_ZERO); 6326 (*ap->a_cookies)[*ap->a_ncookies] = off; 6327 *ap->a_ncookies += 1; 6328 return (0); 6329 } 6330 6331 /* 6332 * The purpose of this routine is to remove granularity from accmode_t, 6333 * reducing it into standard unix access bits - VEXEC, VREAD, VWRITE, 6334 * VADMIN and VAPPEND. 6335 * 6336 * If it returns 0, the caller is supposed to continue with the usual 6337 * access checks using 'accmode' as modified by this routine. 
If it 6338 * returns nonzero value, the caller is supposed to return that value 6339 * as errno. 6340 * 6341 * Note that after this routine runs, accmode may be zero. 6342 */ 6343 int 6344 vfs_unixify_accmode(accmode_t *accmode) 6345 { 6346 /* 6347 * There is no way to specify explicit "deny" rule using 6348 * file mode or POSIX.1e ACLs. 6349 */ 6350 if (*accmode & VEXPLICIT_DENY) { 6351 *accmode = 0; 6352 return (0); 6353 } 6354 6355 /* 6356 * None of these can be translated into usual access bits. 6357 * Also, the common case for NFSv4 ACLs is to not contain 6358 * either of these bits. Caller should check for VWRITE 6359 * on the containing directory instead. 6360 */ 6361 if (*accmode & (VDELETE_CHILD | VDELETE)) 6362 return (EPERM); 6363 6364 if (*accmode & VADMIN_PERMS) { 6365 *accmode &= ~VADMIN_PERMS; 6366 *accmode |= VADMIN; 6367 } 6368 6369 /* 6370 * There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL 6371 * or VSYNCHRONIZE using file mode or POSIX.1e ACL. 6372 */ 6373 *accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE); 6374 6375 return (0); 6376 } 6377 6378 /* 6379 * Clear out a doomed vnode (if any) and replace it with a new one as long 6380 * as the fs is not being unmounted. Return the root vnode to the caller. 6381 */ 6382 static int __noinline 6383 vfs_cache_root_fallback(struct mount *mp, int flags, struct vnode **vpp) 6384 { 6385 struct vnode *vp; 6386 int error; 6387 6388 restart: 6389 if (mp->mnt_rootvnode != NULL) { 6390 MNT_ILOCK(mp); 6391 vp = mp->mnt_rootvnode; 6392 if (vp != NULL) { 6393 if (!VN_IS_DOOMED(vp)) { 6394 vrefact(vp); 6395 MNT_IUNLOCK(mp); 6396 error = vn_lock(vp, flags); 6397 if (error == 0) { 6398 *vpp = vp; 6399 return (0); 6400 } 6401 vrele(vp); 6402 goto restart; 6403 } 6404 /* 6405 * Clear the old one. 6406 */ 6407 mp->mnt_rootvnode = NULL; 6408 } 6409 MNT_IUNLOCK(mp); 6410 if (vp != NULL) { 6411 vfs_op_barrier_wait(mp); 6412 vrele(vp); 6413 } 6414 } 6415 error = VFS_CACHEDROOT(mp, flags, vpp); 6416 if (error != 0) 6417 return (error); 6418 if (mp->mnt_vfs_ops == 0) { 6419 MNT_ILOCK(mp); 6420 if (mp->mnt_vfs_ops != 0) { 6421 MNT_IUNLOCK(mp); 6422 return (0); 6423 } 6424 if (mp->mnt_rootvnode == NULL) { 6425 vrefact(*vpp); 6426 mp->mnt_rootvnode = *vpp; 6427 } else { 6428 if (mp->mnt_rootvnode != *vpp) { 6429 if (!VN_IS_DOOMED(mp->mnt_rootvnode)) { 6430 panic("%s: mismatch between vnode returned " 6431 " by VFS_CACHEDROOT and the one cached " 6432 " (%p != %p)", 6433 __func__, *vpp, mp->mnt_rootvnode); 6434 } 6435 } 6436 } 6437 MNT_IUNLOCK(mp); 6438 } 6439 return (0); 6440 } 6441 6442 int 6443 vfs_cache_root(struct mount *mp, int flags, struct vnode **vpp) 6444 { 6445 struct mount_pcpu *mpcpu; 6446 struct vnode *vp; 6447 int error; 6448 6449 if (!vfs_op_thread_enter(mp, mpcpu)) 6450 return (vfs_cache_root_fallback(mp, flags, vpp)); 6451 vp = atomic_load_ptr(&mp->mnt_rootvnode); 6452 if (vp == NULL || VN_IS_DOOMED(vp)) { 6453 vfs_op_thread_exit(mp, mpcpu); 6454 return (vfs_cache_root_fallback(mp, flags, vpp)); 6455 } 6456 vrefact(vp); 6457 vfs_op_thread_exit(mp, mpcpu); 6458 error = vn_lock(vp, flags); 6459 if (error != 0) { 6460 vrele(vp); 6461 return (vfs_cache_root_fallback(mp, flags, vpp)); 6462 } 6463 *vpp = vp; 6464 return (0); 6465 } 6466 6467 struct vnode * 6468 vfs_cache_root_clear(struct mount *mp) 6469 { 6470 struct vnode *vp; 6471 6472 /* 6473 * ops > 0 guarantees there is nobody who can see this vnode 6474 */ 6475 MPASS(mp->mnt_vfs_ops > 0); 6476 vp = mp->mnt_rootvnode; 6477 if (vp != NULL) 6478 vn_seqc_write_begin(vp); 6479 mp->mnt_rootvnode = 
NULL; 6480 return (vp); 6481 } 6482 6483 void 6484 vfs_cache_root_set(struct mount *mp, struct vnode *vp) 6485 { 6486 6487 MPASS(mp->mnt_vfs_ops > 0); 6488 vrefact(vp); 6489 mp->mnt_rootvnode = vp; 6490 } 6491 6492 /* 6493 * These are helper functions for filesystems to traverse all 6494 * their vnodes. See MNT_VNODE_FOREACH_ALL() in sys/mount.h. 6495 * 6496 * This interface replaces MNT_VNODE_FOREACH. 6497 */ 6498 6499 struct vnode * 6500 __mnt_vnode_next_all(struct vnode **mvp, struct mount *mp) 6501 { 6502 struct vnode *vp; 6503 6504 if (should_yield()) 6505 kern_yield(PRI_USER); 6506 MNT_ILOCK(mp); 6507 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); 6508 for (vp = TAILQ_NEXT(*mvp, v_nmntvnodes); vp != NULL; 6509 vp = TAILQ_NEXT(vp, v_nmntvnodes)) { 6510 /* Allow a racy peek at VIRF_DOOMED to save a lock acquisition. */ 6511 if (vp->v_type == VMARKER || VN_IS_DOOMED(vp)) 6512 continue; 6513 VI_LOCK(vp); 6514 if (VN_IS_DOOMED(vp)) { 6515 VI_UNLOCK(vp); 6516 continue; 6517 } 6518 break; 6519 } 6520 if (vp == NULL) { 6521 __mnt_vnode_markerfree_all(mvp, mp); 6522 /* MNT_IUNLOCK(mp); -- done in above function */ 6523 mtx_assert(MNT_MTX(mp), MA_NOTOWNED); 6524 return (NULL); 6525 } 6526 TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); 6527 TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); 6528 MNT_IUNLOCK(mp); 6529 return (vp); 6530 } 6531 6532 struct vnode * 6533 __mnt_vnode_first_all(struct vnode **mvp, struct mount *mp) 6534 { 6535 struct vnode *vp; 6536 6537 *mvp = vn_alloc_marker(mp); 6538 MNT_ILOCK(mp); 6539 MNT_REF(mp); 6540 6541 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 6542 /* Allow a racy peek at VIRF_DOOMED to save a lock acquisition. */ 6543 if (vp->v_type == VMARKER || VN_IS_DOOMED(vp)) 6544 continue; 6545 VI_LOCK(vp); 6546 if (VN_IS_DOOMED(vp)) { 6547 VI_UNLOCK(vp); 6548 continue; 6549 } 6550 break; 6551 } 6552 if (vp == NULL) { 6553 MNT_REL(mp); 6554 MNT_IUNLOCK(mp); 6555 vn_free_marker(*mvp); 6556 *mvp = NULL; 6557 return (NULL); 6558 } 6559 TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); 6560 MNT_IUNLOCK(mp); 6561 return (vp); 6562 } 6563 6564 void 6565 __mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp) 6566 { 6567 6568 if (*mvp == NULL) { 6569 MNT_IUNLOCK(mp); 6570 return; 6571 } 6572 6573 mtx_assert(MNT_MTX(mp), MA_OWNED); 6574 6575 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); 6576 TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); 6577 MNT_REL(mp); 6578 MNT_IUNLOCK(mp); 6579 vn_free_marker(*mvp); 6580 *mvp = NULL; 6581 } 6582 6583 /* 6584 * These are helper functions for filesystems to traverse their 6585 * lazy vnodes. See MNT_VNODE_FOREACH_LAZY() in sys/mount.h 6586 */ 6587 static void 6588 mnt_vnode_markerfree_lazy(struct vnode **mvp, struct mount *mp) 6589 { 6590 6591 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); 6592 6593 MNT_ILOCK(mp); 6594 MNT_REL(mp); 6595 MNT_IUNLOCK(mp); 6596 vn_free_marker(*mvp); 6597 *mvp = NULL; 6598 } 6599 6600 /* 6601 * Relock the mp mount vnode list lock with the vp vnode interlock in the 6602 * conventional lock order during mnt_vnode_next_lazy iteration. 6603 * 6604 * On entry, the mount vnode list lock is held and the vnode interlock is not. 6605 * The list lock is dropped and reacquired. On success, both locks are held. 6606 * On failure, the mount vnode list lock is held but the vnode interlock is 6607 * not, and the procedure may have yielded. 
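 *
 * The conventional order takes the vnode interlock before mnt_listmtx.
 * The lazy iterator holds mnt_listmtx and therefore may only try-lock the
 * interlock; when that fails, this helper takes a hold reference, drops
 * the list lock, acquires the interlock blocking and reacquires the list
 * lock, reporting whether the vnode is still usable for the iteration.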
6608 */ 6609 static bool 6610 mnt_vnode_next_lazy_relock(struct vnode *mvp, struct mount *mp, 6611 struct vnode *vp) 6612 { 6613 6614 VNASSERT(mvp->v_mount == mp && mvp->v_type == VMARKER && 6615 TAILQ_NEXT(mvp, v_lazylist) != NULL, mvp, 6616 ("%s: bad marker", __func__)); 6617 VNASSERT(vp->v_mount == mp && vp->v_type != VMARKER, vp, 6618 ("%s: inappropriate vnode", __func__)); 6619 ASSERT_VI_UNLOCKED(vp, __func__); 6620 mtx_assert(&mp->mnt_listmtx, MA_OWNED); 6621 6622 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, mvp, v_lazylist); 6623 TAILQ_INSERT_BEFORE(vp, mvp, v_lazylist); 6624 6625 /* 6626 * Note we may be racing against vdrop which transitioned the hold 6627 * count to 0 and now waits for the ->mnt_listmtx lock. This is fine, 6628 * if we are the only user after we get the interlock we will just 6629 * vdrop. 6630 */ 6631 vhold(vp); 6632 mtx_unlock(&mp->mnt_listmtx); 6633 VI_LOCK(vp); 6634 if (VN_IS_DOOMED(vp)) { 6635 VNPASS((vp->v_mflag & VMP_LAZYLIST) == 0, vp); 6636 goto out_lost; 6637 } 6638 VNPASS(vp->v_mflag & VMP_LAZYLIST, vp); 6639 /* 6640 * There is nothing to do if we are the last user. 6641 */ 6642 if (!refcount_release_if_not_last(&vp->v_holdcnt)) 6643 goto out_lost; 6644 mtx_lock(&mp->mnt_listmtx); 6645 return (true); 6646 out_lost: 6647 vdropl(vp); 6648 maybe_yield(); 6649 mtx_lock(&mp->mnt_listmtx); 6650 return (false); 6651 } 6652 6653 static struct vnode * 6654 mnt_vnode_next_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb, 6655 void *cbarg) 6656 { 6657 struct vnode *vp; 6658 6659 mtx_assert(&mp->mnt_listmtx, MA_OWNED); 6660 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); 6661 restart: 6662 vp = TAILQ_NEXT(*mvp, v_lazylist); 6663 while (vp != NULL) { 6664 if (vp->v_type == VMARKER) { 6665 vp = TAILQ_NEXT(vp, v_lazylist); 6666 continue; 6667 } 6668 /* 6669 * See if we want to process the vnode. Note we may encounter a 6670 * long string of vnodes we don't care about and hog the list 6671 * as a result. Check for it and requeue the marker. 6672 */ 6673 VNPASS(!VN_IS_DOOMED(vp), vp); 6674 if (!cb(vp, cbarg)) { 6675 if (!should_yield()) { 6676 vp = TAILQ_NEXT(vp, v_lazylist); 6677 continue; 6678 } 6679 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, 6680 v_lazylist); 6681 TAILQ_INSERT_AFTER(&mp->mnt_lazyvnodelist, vp, *mvp, 6682 v_lazylist); 6683 mtx_unlock(&mp->mnt_listmtx); 6684 kern_yield(PRI_USER); 6685 mtx_lock(&mp->mnt_listmtx); 6686 goto restart; 6687 } 6688 /* 6689 * Try-lock because this is the wrong lock order. 
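 * We hold mnt_listmtx while the conventional order takes the vnode
 * interlock first; fall back to mnt_vnode_next_lazy_relock() when the
 * try-lock fails.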
6690 */ 6691 if (!VI_TRYLOCK(vp) && 6692 !mnt_vnode_next_lazy_relock(*mvp, mp, vp)) 6693 goto restart; 6694 KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp)); 6695 KASSERT(vp->v_mount == mp || vp->v_mount == NULL, 6696 ("alien vnode on the lazy list %p %p", vp, mp)); 6697 VNPASS(vp->v_mount == mp, vp); 6698 VNPASS(!VN_IS_DOOMED(vp), vp); 6699 break; 6700 } 6701 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, v_lazylist); 6702 6703 /* Check if we are done */ 6704 if (vp == NULL) { 6705 mtx_unlock(&mp->mnt_listmtx); 6706 mnt_vnode_markerfree_lazy(mvp, mp); 6707 return (NULL); 6708 } 6709 TAILQ_INSERT_AFTER(&mp->mnt_lazyvnodelist, vp, *mvp, v_lazylist); 6710 mtx_unlock(&mp->mnt_listmtx); 6711 ASSERT_VI_LOCKED(vp, "lazy iter"); 6712 return (vp); 6713 } 6714 6715 struct vnode * 6716 __mnt_vnode_next_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb, 6717 void *cbarg) 6718 { 6719 6720 if (should_yield()) 6721 kern_yield(PRI_USER); 6722 mtx_lock(&mp->mnt_listmtx); 6723 return (mnt_vnode_next_lazy(mvp, mp, cb, cbarg)); 6724 } 6725 6726 struct vnode * 6727 __mnt_vnode_first_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb, 6728 void *cbarg) 6729 { 6730 struct vnode *vp; 6731 6732 if (TAILQ_EMPTY(&mp->mnt_lazyvnodelist)) 6733 return (NULL); 6734 6735 *mvp = vn_alloc_marker(mp); 6736 MNT_ILOCK(mp); 6737 MNT_REF(mp); 6738 MNT_IUNLOCK(mp); 6739 6740 mtx_lock(&mp->mnt_listmtx); 6741 vp = TAILQ_FIRST(&mp->mnt_lazyvnodelist); 6742 if (vp == NULL) { 6743 mtx_unlock(&mp->mnt_listmtx); 6744 mnt_vnode_markerfree_lazy(mvp, mp); 6745 return (NULL); 6746 } 6747 TAILQ_INSERT_BEFORE(vp, *mvp, v_lazylist); 6748 return (mnt_vnode_next_lazy(mvp, mp, cb, cbarg)); 6749 } 6750 6751 void 6752 __mnt_vnode_markerfree_lazy(struct vnode **mvp, struct mount *mp) 6753 { 6754 6755 if (*mvp == NULL) 6756 return; 6757 6758 mtx_lock(&mp->mnt_listmtx); 6759 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, v_lazylist); 6760 mtx_unlock(&mp->mnt_listmtx); 6761 mnt_vnode_markerfree_lazy(mvp, mp); 6762 } 6763 6764 int 6765 vn_dir_check_exec(struct vnode *vp, struct componentname *cnp) 6766 { 6767 6768 if ((cnp->cn_flags & NOEXECCHECK) != 0) { 6769 cnp->cn_flags &= ~NOEXECCHECK; 6770 return (0); 6771 } 6772 6773 return (VOP_ACCESS(vp, VEXEC, cnp->cn_cred, cnp->cn_thread)); 6774 } 6775 6776 /* 6777 * Do not use this variant unless you have means other than the hold count 6778 * to prevent the vnode from getting freed. 
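 *
 * The v_seqc field implements a sequence-counter style protocol: writers
 * bracket vnode updates with vn_seqc_write_begin()/vn_seqc_write_end(),
 * while lockless readers compare snapshots of v_seqc taken around their
 * accesses and fall back to the locked path if a write was in progress.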
6779 */ 6780 void 6781 vn_seqc_write_begin_unheld_locked(struct vnode *vp) 6782 { 6783 6784 ASSERT_VI_LOCKED(vp, __func__); 6785 VNPASS(vp->v_seqc_users >= 0, vp); 6786 vp->v_seqc_users++; 6787 if (vp->v_seqc_users == 1) 6788 seqc_sleepable_write_begin(&vp->v_seqc); 6789 } 6790 6791 void 6792 vn_seqc_write_begin_locked(struct vnode *vp) 6793 { 6794 6795 ASSERT_VI_LOCKED(vp, __func__); 6796 VNPASS(vp->v_holdcnt > 0, vp); 6797 vn_seqc_write_begin_unheld_locked(vp); 6798 } 6799 6800 void 6801 vn_seqc_write_begin(struct vnode *vp) 6802 { 6803 6804 VI_LOCK(vp); 6805 vn_seqc_write_begin_locked(vp); 6806 VI_UNLOCK(vp); 6807 } 6808 6809 void 6810 vn_seqc_write_begin_unheld(struct vnode *vp) 6811 { 6812 6813 VI_LOCK(vp); 6814 vn_seqc_write_begin_unheld_locked(vp); 6815 VI_UNLOCK(vp); 6816 } 6817 6818 void 6819 vn_seqc_write_end_locked(struct vnode *vp) 6820 { 6821 6822 ASSERT_VI_LOCKED(vp, __func__); 6823 VNPASS(vp->v_seqc_users > 0, vp); 6824 vp->v_seqc_users--; 6825 if (vp->v_seqc_users == 0) 6826 seqc_sleepable_write_end(&vp->v_seqc); 6827 } 6828 6829 void 6830 vn_seqc_write_end(struct vnode *vp) 6831 { 6832 6833 VI_LOCK(vp); 6834 vn_seqc_write_end_locked(vp); 6835 VI_UNLOCK(vp); 6836 } 6837 6838 /* 6839 * Special case handling for allocating and freeing vnodes. 6840 * 6841 * The counter remains unchanged on free so that a doomed vnode will 6842 * keep testing as in modify as long as it is accessible with SMR. 6843 */ 6844 static void 6845 vn_seqc_init(struct vnode *vp) 6846 { 6847 6848 vp->v_seqc = 0; 6849 vp->v_seqc_users = 0; 6850 } 6851 6852 static void 6853 vn_seqc_write_end_free(struct vnode *vp) 6854 { 6855 6856 VNPASS(seqc_in_modify(vp->v_seqc), vp); 6857 VNPASS(vp->v_seqc_users == 1, vp); 6858 } 6859 6860 void 6861 vn_irflag_set_locked(struct vnode *vp, short toset) 6862 { 6863 short flags; 6864 6865 ASSERT_VI_LOCKED(vp, __func__); 6866 flags = vn_irflag_read(vp); 6867 VNASSERT((flags & toset) == 0, vp, 6868 ("%s: some of the passed flags already set (have %d, passed %d)\n", 6869 __func__, flags, toset)); 6870 atomic_store_short(&vp->v_irflag, flags | toset); 6871 } 6872 6873 void 6874 vn_irflag_set(struct vnode *vp, short toset) 6875 { 6876 6877 VI_LOCK(vp); 6878 vn_irflag_set_locked(vp, toset); 6879 VI_UNLOCK(vp); 6880 } 6881 6882 void 6883 vn_irflag_set_cond_locked(struct vnode *vp, short toset) 6884 { 6885 short flags; 6886 6887 ASSERT_VI_LOCKED(vp, __func__); 6888 flags = vn_irflag_read(vp); 6889 atomic_store_short(&vp->v_irflag, flags | toset); 6890 } 6891 6892 void 6893 vn_irflag_set_cond(struct vnode *vp, short toset) 6894 { 6895 6896 VI_LOCK(vp); 6897 vn_irflag_set_cond_locked(vp, toset); 6898 VI_UNLOCK(vp); 6899 } 6900 6901 void 6902 vn_irflag_unset_locked(struct vnode *vp, short tounset) 6903 { 6904 short flags; 6905 6906 ASSERT_VI_LOCKED(vp, __func__); 6907 flags = vn_irflag_read(vp); 6908 VNASSERT((flags & tounset) == tounset, vp, 6909 ("%s: some of the passed flags not set (have %d, passed %d)\n", 6910 __func__, flags, tounset)); 6911 atomic_store_short(&vp->v_irflag, flags & ~tounset); 6912 } 6913 6914 void 6915 vn_irflag_unset(struct vnode *vp, short tounset) 6916 { 6917 6918 VI_LOCK(vp); 6919 vn_irflag_unset_locked(vp, tounset); 6920 VI_UNLOCK(vp); 6921 } 6922
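
/*
 * Usage sketch for the v_irflag helpers above (illustrative only; the
 * flag name below is hypothetical and not defined by this file):
 *
 *	VI_LOCK(vp);
 *	if ((vn_irflag_read(vp) & VIRF_EXAMPLE) == 0)
 *		vn_irflag_set_locked(vp, VIRF_EXAMPLE);
 *	VI_UNLOCK(vp);
 *
 * Lockless consumers may call vn_irflag_read(vp) at any time; the setters
 * publish updates with atomic_store_short() while the vnode interlock
 * serializes the read-modify-write itself.
 */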