1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1989, 1993 5 * The Regents of the University of California. All rights reserved. 6 * (c) UNIX System Laboratories, Inc. 7 * All or some portions of this file are derived from material licensed 8 * to the University of California by American Telephone and Telegraph 9 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 10 * the permission of UNIX System Laboratories, Inc. 11 * 12 * Redistribution and use in source and binary forms, with or without 13 * modification, are permitted provided that the following conditions 14 * are met: 15 * 1. Redistributions of source code must retain the above copyright 16 * notice, this list of conditions and the following disclaimer. 17 * 2. Redistributions in binary form must reproduce the above copyright 18 * notice, this list of conditions and the following disclaimer in the 19 * documentation and/or other materials provided with the distribution. 20 * 3. Neither the name of the University nor the names of its contributors 21 * may be used to endorse or promote products derived from this software 22 * without specific prior written permission. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 34 * SUCH DAMAGE. 
35 */ 36 37 /* 38 * External virtual filesystem routines 39 */ 40 41 #include <sys/cdefs.h> 42 #include "opt_ddb.h" 43 #include "opt_watchdog.h" 44 45 #include <sys/param.h> 46 #include <sys/systm.h> 47 #include <sys/asan.h> 48 #include <sys/bio.h> 49 #include <sys/buf.h> 50 #include <sys/capsicum.h> 51 #include <sys/condvar.h> 52 #include <sys/conf.h> 53 #include <sys/counter.h> 54 #include <sys/dirent.h> 55 #include <sys/event.h> 56 #include <sys/eventhandler.h> 57 #include <sys/extattr.h> 58 #include <sys/file.h> 59 #include <sys/fcntl.h> 60 #include <sys/jail.h> 61 #include <sys/kdb.h> 62 #include <sys/kernel.h> 63 #include <sys/kthread.h> 64 #include <sys/ktr.h> 65 #include <sys/limits.h> 66 #include <sys/lockf.h> 67 #include <sys/malloc.h> 68 #include <sys/mount.h> 69 #include <sys/namei.h> 70 #include <sys/pctrie.h> 71 #include <sys/priv.h> 72 #include <sys/reboot.h> 73 #include <sys/refcount.h> 74 #include <sys/rwlock.h> 75 #include <sys/sched.h> 76 #include <sys/sleepqueue.h> 77 #include <sys/smr.h> 78 #include <sys/smp.h> 79 #include <sys/stat.h> 80 #include <sys/sysctl.h> 81 #include <sys/syslog.h> 82 #include <sys/user.h> 83 #include <sys/vmmeter.h> 84 #include <sys/vnode.h> 85 #include <sys/watchdog.h> 86 87 #include <machine/stdarg.h> 88 89 #include <security/mac/mac_framework.h> 90 91 #include <vm/vm.h> 92 #include <vm/vm_object.h> 93 #include <vm/vm_extern.h> 94 #include <vm/pmap.h> 95 #include <vm/vm_map.h> 96 #include <vm/vm_page.h> 97 #include <vm/vm_kern.h> 98 #include <vm/vnode_pager.h> 99 #include <vm/uma.h> 100 101 #if defined(DEBUG_VFS_LOCKS) && (!defined(INVARIANTS) || !defined(WITNESS)) 102 #error DEBUG_VFS_LOCKS requires INVARIANTS and WITNESS 103 #endif 104 105 #ifdef DDB 106 #include <ddb/ddb.h> 107 #endif 108 109 static void delmntque(struct vnode *vp); 110 static int flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, 111 int slpflag, int slptimeo); 112 static void syncer_shutdown(void *arg, int howto); 113 static int vtryrecycle(struct vnode *vp, bool isvnlru); 114 static void v_init_counters(struct vnode *); 115 static void vn_seqc_init(struct vnode *); 116 static void vn_seqc_write_end_free(struct vnode *vp); 117 static void vgonel(struct vnode *); 118 static bool vhold_recycle_free(struct vnode *); 119 static void vdropl_recycle(struct vnode *vp); 120 static void vdrop_recycle(struct vnode *vp); 121 static void vfs_knllock(void *arg); 122 static void vfs_knlunlock(void *arg); 123 static void vfs_knl_assert_lock(void *arg, int what); 124 static void destroy_vpollinfo(struct vpollinfo *vi); 125 static int v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo, 126 daddr_t startlbn, daddr_t endlbn); 127 static void vnlru_recalc(void); 128 129 static SYSCTL_NODE(_vfs, OID_AUTO, vnode, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 130 "vnode configuration and statistics"); 131 static SYSCTL_NODE(_vfs_vnode, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 132 "vnode configuration"); 133 static SYSCTL_NODE(_vfs_vnode, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 134 "vnode statistics"); 135 static SYSCTL_NODE(_vfs_vnode, OID_AUTO, vnlru, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 136 "vnode recycling"); 137 138 /* 139 * Number of vnodes in existence. Increased whenever getnewvnode() 140 * allocates a new vnode, decreased in vdropl() for VIRF_DOOMED vnode. 
141 */ 142 static u_long __exclusive_cache_line numvnodes; 143 144 SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, 145 "Number of vnodes in existence (legacy)"); 146 SYSCTL_ULONG(_vfs_vnode_stats, OID_AUTO, count, CTLFLAG_RD, &numvnodes, 0, 147 "Number of vnodes in existence"); 148 149 static counter_u64_t vnodes_created; 150 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, vnodes_created, CTLFLAG_RD, &vnodes_created, 151 "Number of vnodes created by getnewvnode (legacy)"); 152 SYSCTL_COUNTER_U64(_vfs_vnode_stats, OID_AUTO, created, CTLFLAG_RD, &vnodes_created, 153 "Number of vnodes created by getnewvnode"); 154 155 /* 156 * Conversion tables for conversion from vnode types to inode formats 157 * and back. 158 */ 159 __enum_uint8(vtype) iftovt_tab[16] = { 160 VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, 161 VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON 162 }; 163 int vttoif_tab[10] = { 164 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, 165 S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT 166 }; 167 168 /* 169 * List of allocates vnodes in the system. 170 */ 171 static TAILQ_HEAD(freelst, vnode) vnode_list; 172 static struct vnode *vnode_list_free_marker; 173 static struct vnode *vnode_list_reclaim_marker; 174 175 /* 176 * "Free" vnode target. Free vnodes are rarely completely free, but are 177 * just ones that are cheap to recycle. Usually they are for files which 178 * have been stat'd but not read; these usually have inode and namecache 179 * data attached to them. This target is the preferred minimum size of a 180 * sub-cache consisting mostly of such files. The system balances the size 181 * of this sub-cache with its complement to try to prevent either from 182 * thrashing while the other is relatively inactive. The targets express 183 * a preference for the best balance. 184 * 185 * "Above" this target there are 2 further targets (watermarks) related 186 * to recyling of free vnodes. In the best-operating case, the cache is 187 * exactly full, the free list has size between vlowat and vhiwat above the 188 * free target, and recycling from it and normal use maintains this state. 189 * Sometimes the free list is below vlowat or even empty, but this state 190 * is even better for immediate use provided the cache is not full. 191 * Otherwise, vnlru_proc() runs to reclaim enough vnodes (usually non-free 192 * ones) to reach one of these states. The watermarks are currently hard- 193 * coded as 4% and 9% of the available space higher. These and the default 194 * of 25% for wantfreevnodes are too large if the memory size is large. 195 * E.g., 9% of 75% of MAXVNODES is more than 566000 vnodes to reclaim 196 * whenever vnlru_proc() becomes active. 
197 */ 198 static long wantfreevnodes; 199 static long __exclusive_cache_line freevnodes; 200 static long freevnodes_old; 201 202 static u_long recycles_count; 203 SYSCTL_ULONG(_vfs, OID_AUTO, recycles, CTLFLAG_RD | CTLFLAG_STATS, &recycles_count, 0, 204 "Number of vnodes recycled to meet vnode cache targets (legacy)"); 205 SYSCTL_ULONG(_vfs_vnode_vnlru, OID_AUTO, recycles, CTLFLAG_RD | CTLFLAG_STATS, 206 &recycles_count, 0, 207 "Number of vnodes recycled to meet vnode cache targets"); 208 209 static u_long recycles_free_count; 210 SYSCTL_ULONG(_vfs, OID_AUTO, recycles_free, CTLFLAG_RD | CTLFLAG_STATS, 211 &recycles_free_count, 0, 212 "Number of free vnodes recycled to meet vnode cache targets (legacy)"); 213 SYSCTL_ULONG(_vfs_vnode_vnlru, OID_AUTO, recycles_free, CTLFLAG_RD | CTLFLAG_STATS, 214 &recycles_free_count, 0, 215 "Number of free vnodes recycled to meet vnode cache targets"); 216 217 static counter_u64_t direct_recycles_free_count; 218 SYSCTL_COUNTER_U64(_vfs_vnode_vnlru, OID_AUTO, direct_recycles_free, CTLFLAG_RD, 219 &direct_recycles_free_count, 220 "Number of free vnodes recycled by vn_alloc callers to meet vnode cache targets"); 221 222 static counter_u64_t vnode_skipped_requeues; 223 SYSCTL_COUNTER_U64(_vfs_vnode_stats, OID_AUTO, skipped_requeues, CTLFLAG_RD, &vnode_skipped_requeues, 224 "Number of times LRU requeue was skipped due to lock contention"); 225 226 static __read_mostly bool vnode_can_skip_requeue; 227 SYSCTL_BOOL(_vfs_vnode_param, OID_AUTO, can_skip_requeue, CTLFLAG_RW, 228 &vnode_can_skip_requeue, 0, "Is LRU requeue skippable"); 229 230 static u_long deferred_inact; 231 SYSCTL_ULONG(_vfs, OID_AUTO, deferred_inact, CTLFLAG_RD, 232 &deferred_inact, 0, "Number of times inactive processing was deferred"); 233 234 /* To keep more than one thread at a time from running vfs_getnewfsid */ 235 static struct mtx mntid_mtx; 236 237 /* 238 * Lock for any access to the following: 239 * vnode_list 240 * numvnodes 241 * freevnodes 242 */ 243 static struct mtx __exclusive_cache_line vnode_list_mtx; 244 245 /* Publicly exported FS */ 246 struct nfs_public nfs_pub; 247 248 static uma_zone_t buf_trie_zone; 249 static smr_t buf_trie_smr; 250 251 /* Zone for allocation of new vnodes - used exclusively by getnewvnode() */ 252 static uma_zone_t vnode_zone; 253 MALLOC_DEFINE(M_VNODEPOLL, "VN POLL", "vnode poll"); 254 255 __read_frequently smr_t vfs_smr; 256 257 /* 258 * The workitem queue. 259 * 260 * It is useful to delay writes of file data and filesystem metadata 261 * for tens of seconds so that quickly created and deleted files need 262 * not waste disk bandwidth being created and removed. To realize this, 263 * we append vnodes to a "workitem" queue. When running with a soft 264 * updates implementation, most pending metadata dependencies should 265 * not wait for more than a few seconds. Thus, mounted on block devices 266 * are delayed only about a half the time that file data is delayed. 267 * Similarly, directory updates are more critical, so are only delayed 268 * about a third the time that file data is delayed. Thus, there are 269 * SYNCER_MAXDELAY queues that are processed round-robin at a rate of 270 * one each second (driven off the filesystem syncer process). The 271 * syncer_delayno variable indicates the next queue that is to be processed. 
272 * Items that need to be processed soon are placed in this queue: 273 * 274 * syncer_workitem_pending[syncer_delayno] 275 * 276 * A delay of fifteen seconds is done by placing the request fifteen 277 * entries later in the queue: 278 * 279 * syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask] 280 * 281 */ 282 static int syncer_delayno; 283 static long syncer_mask; 284 LIST_HEAD(synclist, bufobj); 285 static struct synclist *syncer_workitem_pending; 286 /* 287 * The sync_mtx protects: 288 * bo->bo_synclist 289 * sync_vnode_count 290 * syncer_delayno 291 * syncer_state 292 * syncer_workitem_pending 293 * syncer_worklist_len 294 * rushjob 295 */ 296 static struct mtx sync_mtx; 297 static struct cv sync_wakeup; 298 299 #define SYNCER_MAXDELAY 32 300 static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */ 301 static int syncdelay = 30; /* max time to delay syncing data */ 302 static int filedelay = 30; /* time to delay syncing files */ 303 SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, 304 "Time to delay syncing files (in seconds)"); 305 static int dirdelay = 29; /* time to delay syncing directories */ 306 SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, 307 "Time to delay syncing directories (in seconds)"); 308 static int metadelay = 28; /* time to delay syncing metadata */ 309 SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, 310 "Time to delay syncing metadata (in seconds)"); 311 static int rushjob; /* number of slots to run ASAP */ 312 static int stat_rush_requests; /* number of times I/O speeded up */ 313 SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, 314 "Number of times I/O speeded up (rush requests)"); 315 316 #define VDBATCH_SIZE 8 317 struct vdbatch { 318 u_int index; 319 struct mtx lock; 320 struct vnode *tab[VDBATCH_SIZE]; 321 }; 322 DPCPU_DEFINE_STATIC(struct vdbatch, vd); 323 324 static void vdbatch_dequeue(struct vnode *vp); 325 326 /* 327 * The syncer will require at least SYNCER_MAXDELAY iterations to shutdown; 328 * we probably don't want to pause for the whole second each time. 329 */ 330 #define SYNCER_SHUTDOWN_SPEEDUP 32 331 static int sync_vnode_count; 332 static int syncer_worklist_len; 333 static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY } 334 syncer_state; 335 336 /* Target for maximum number of vnodes. */ 337 u_long desiredvnodes; 338 static u_long gapvnodes; /* gap between wanted and desired */ 339 static u_long vhiwat; /* enough extras after expansion */ 340 static u_long vlowat; /* minimal extras before expansion */ 341 static bool vstir; /* nonzero to stir non-free vnodes */ 342 static volatile int vsmalltrigger = 8; /* pref to keep if > this many pages */ 343 344 static u_long vnlru_read_freevnodes(void); 345 346 /* 347 * Note that no attempt is made to sanitize these parameters. 348 */ 349 static int 350 sysctl_maxvnodes(SYSCTL_HANDLER_ARGS) 351 { 352 u_long val; 353 int error; 354 355 val = desiredvnodes; 356 error = sysctl_handle_long(oidp, &val, 0, req); 357 if (error != 0 || req->newptr == NULL) 358 return (error); 359 360 if (val == desiredvnodes) 361 return (0); 362 mtx_lock(&vnode_list_mtx); 363 desiredvnodes = val; 364 wantfreevnodes = desiredvnodes / 4; 365 vnlru_recalc(); 366 mtx_unlock(&vnode_list_mtx); 367 /* 368 * XXX There is no protection against multiple threads changing 369 * desiredvnodes at the same time. Locking above only helps vnlru and 370 * getnewvnode. 
371 */ 372 vfs_hash_changesize(desiredvnodes); 373 cache_changesize(desiredvnodes); 374 return (0); 375 } 376 377 SYSCTL_PROC(_kern, KERN_MAXVNODES, maxvnodes, 378 CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_maxvnodes, 379 "LU", "Target for maximum number of vnodes (legacy)"); 380 SYSCTL_PROC(_vfs_vnode_param, OID_AUTO, limit, 381 CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_maxvnodes, 382 "LU", "Target for maximum number of vnodes"); 383 384 static int 385 sysctl_freevnodes(SYSCTL_HANDLER_ARGS) 386 { 387 u_long rfreevnodes; 388 389 rfreevnodes = vnlru_read_freevnodes(); 390 return (sysctl_handle_long(oidp, &rfreevnodes, 0, req)); 391 } 392 393 SYSCTL_PROC(_vfs, OID_AUTO, freevnodes, 394 CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RD, NULL, 0, sysctl_freevnodes, 395 "LU", "Number of \"free\" vnodes (legacy)"); 396 SYSCTL_PROC(_vfs_vnode_stats, OID_AUTO, free, 397 CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RD, NULL, 0, sysctl_freevnodes, 398 "LU", "Number of \"free\" vnodes"); 399 400 static int 401 sysctl_wantfreevnodes(SYSCTL_HANDLER_ARGS) 402 { 403 u_long val; 404 int error; 405 406 val = wantfreevnodes; 407 error = sysctl_handle_long(oidp, &val, 0, req); 408 if (error != 0 || req->newptr == NULL) 409 return (error); 410 411 if (val == wantfreevnodes) 412 return (0); 413 mtx_lock(&vnode_list_mtx); 414 wantfreevnodes = val; 415 vnlru_recalc(); 416 mtx_unlock(&vnode_list_mtx); 417 return (0); 418 } 419 420 SYSCTL_PROC(_vfs, OID_AUTO, wantfreevnodes, 421 CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_wantfreevnodes, 422 "LU", "Target for minimum number of \"free\" vnodes (legacy)"); 423 SYSCTL_PROC(_vfs_vnode_param, OID_AUTO, wantfree, 424 CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_wantfreevnodes, 425 "LU", "Target for minimum number of \"free\" vnodes"); 426 427 static int vnlru_nowhere; 428 SYSCTL_INT(_vfs_vnode_vnlru, OID_AUTO, failed_runs, CTLFLAG_RD | CTLFLAG_STATS, 429 &vnlru_nowhere, 0, "Number of times the vnlru process ran without success"); 430 431 static int 432 sysctl_try_reclaim_vnode(SYSCTL_HANDLER_ARGS) 433 { 434 struct vnode *vp; 435 struct nameidata nd; 436 char *buf; 437 unsigned long ndflags; 438 int error; 439 440 if (req->newptr == NULL) 441 return (EINVAL); 442 if (req->newlen >= PATH_MAX) 443 return (E2BIG); 444 445 buf = malloc(PATH_MAX, M_TEMP, M_WAITOK); 446 error = SYSCTL_IN(req, buf, req->newlen); 447 if (error != 0) 448 goto out; 449 450 buf[req->newlen] = '\0'; 451 452 ndflags = LOCKLEAF | NOFOLLOW | AUDITVNODE1; 453 NDINIT(&nd, LOOKUP, ndflags, UIO_SYSSPACE, buf); 454 if ((error = namei(&nd)) != 0) 455 goto out; 456 vp = nd.ni_vp; 457 458 if (VN_IS_DOOMED(vp)) { 459 /* 460 * This vnode is being recycled. Return != 0 to let the caller 461 * know that the sysctl had no effect. 
Return EAGAIN because a 462 * subsequent call will likely succeed (since namei will create 463 * a new vnode if necessary) 464 */ 465 error = EAGAIN; 466 goto putvnode; 467 } 468 469 vgone(vp); 470 putvnode: 471 vput(vp); 472 NDFREE_PNBUF(&nd); 473 out: 474 free(buf, M_TEMP); 475 return (error); 476 } 477 478 static int 479 sysctl_ftry_reclaim_vnode(SYSCTL_HANDLER_ARGS) 480 { 481 struct thread *td = curthread; 482 struct vnode *vp; 483 struct file *fp; 484 int error; 485 int fd; 486 487 if (req->newptr == NULL) 488 return (EBADF); 489 490 error = sysctl_handle_int(oidp, &fd, 0, req); 491 if (error != 0) 492 return (error); 493 error = getvnode(curthread, fd, &cap_fcntl_rights, &fp); 494 if (error != 0) 495 return (error); 496 vp = fp->f_vnode; 497 498 error = vn_lock(vp, LK_EXCLUSIVE); 499 if (error != 0) 500 goto drop; 501 502 vgone(vp); 503 VOP_UNLOCK(vp); 504 drop: 505 fdrop(fp, td); 506 return (error); 507 } 508 509 SYSCTL_PROC(_debug, OID_AUTO, try_reclaim_vnode, 510 CTLTYPE_STRING | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0, 511 sysctl_try_reclaim_vnode, "A", "Try to reclaim a vnode by its pathname"); 512 SYSCTL_PROC(_debug, OID_AUTO, ftry_reclaim_vnode, 513 CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0, 514 sysctl_ftry_reclaim_vnode, "I", 515 "Try to reclaim a vnode by its file descriptor"); 516 517 /* Shift count for (uintptr_t)vp to initialize vp->v_hash. */ 518 #define vnsz2log 8 519 #ifndef DEBUG_LOCKS 520 _Static_assert(sizeof(struct vnode) >= 1UL << vnsz2log && 521 sizeof(struct vnode) < 1UL << (vnsz2log + 1), 522 "vnsz2log needs to be updated"); 523 #endif 524 525 /* 526 * Support for the bufobj clean & dirty pctrie. 527 */ 528 static void * 529 buf_trie_alloc(struct pctrie *ptree) 530 { 531 return (uma_zalloc_smr(buf_trie_zone, M_NOWAIT)); 532 } 533 534 static void 535 buf_trie_free(struct pctrie *ptree, void *node) 536 { 537 uma_zfree_smr(buf_trie_zone, node); 538 } 539 PCTRIE_DEFINE_SMR(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free, 540 buf_trie_smr); 541 542 /* 543 * Lookup the next element greater than or equal to lblkno, accounting for the 544 * fact that, for pctries, negative values are greater than nonnegative ones. 545 */ 546 static struct buf * 547 buf_lookup_ge(struct bufv *bv, daddr_t lblkno) 548 { 549 struct buf *bp; 550 551 bp = BUF_PCTRIE_LOOKUP_GE(&bv->bv_root, lblkno); 552 if (bp == NULL && lblkno < 0) 553 bp = BUF_PCTRIE_LOOKUP_GE(&bv->bv_root, 0); 554 if (bp != NULL && bp->b_lblkno < lblkno) 555 bp = NULL; 556 return (bp); 557 } 558 559 /* 560 * Insert bp, and find the next element smaller than bp, accounting for the fact 561 * that, for pctries, negative values are greater than nonnegative ones. 562 */ 563 static int 564 buf_insert_lookup_le(struct bufv *bv, struct buf *bp, struct buf **n) 565 { 566 int error; 567 568 error = BUF_PCTRIE_INSERT_LOOKUP_LE(&bv->bv_root, bp, n); 569 if (error != EEXIST) { 570 if (*n == NULL && bp->b_lblkno >= 0) 571 *n = BUF_PCTRIE_LOOKUP_LE(&bv->bv_root, ~0L); 572 if (*n != NULL && (*n)->b_lblkno >= bp->b_lblkno) 573 *n = NULL; 574 } 575 return (error); 576 } 577 578 /* 579 * Initialize the vnode management data structures. 580 * 581 * Reevaluate the following cap on the number of vnodes after the physical 582 * memory size exceeds 512GB. In the limit, as the physical memory size 583 * grows, the ratio of the memory size in KB to vnodes approaches 64:1. 
584 */ 585 #ifndef MAXVNODES_MAX 586 #define MAXVNODES_MAX (512UL * 1024 * 1024 / 64) /* 8M */ 587 #endif 588 589 static MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker"); 590 591 static struct vnode * 592 vn_alloc_marker(struct mount *mp) 593 { 594 struct vnode *vp; 595 596 vp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO); 597 vp->v_type = VMARKER; 598 vp->v_mount = mp; 599 600 return (vp); 601 } 602 603 static void 604 vn_free_marker(struct vnode *vp) 605 { 606 607 MPASS(vp->v_type == VMARKER); 608 free(vp, M_VNODE_MARKER); 609 } 610 611 #ifdef KASAN 612 static int 613 vnode_ctor(void *mem, int size, void *arg __unused, int flags __unused) 614 { 615 kasan_mark(mem, size, roundup2(size, UMA_ALIGN_PTR + 1), 0); 616 return (0); 617 } 618 619 static void 620 vnode_dtor(void *mem, int size, void *arg __unused) 621 { 622 size_t end1, end2, off1, off2; 623 624 _Static_assert(offsetof(struct vnode, v_vnodelist) < 625 offsetof(struct vnode, v_dbatchcpu), 626 "KASAN marks require updating"); 627 628 off1 = offsetof(struct vnode, v_vnodelist); 629 off2 = offsetof(struct vnode, v_dbatchcpu); 630 end1 = off1 + sizeof(((struct vnode *)NULL)->v_vnodelist); 631 end2 = off2 + sizeof(((struct vnode *)NULL)->v_dbatchcpu); 632 633 /* 634 * Access to the v_vnodelist and v_dbatchcpu fields are permitted even 635 * after the vnode has been freed. Try to get some KASAN coverage by 636 * marking everything except those two fields as invalid. Because 637 * KASAN's tracking is not byte-granular, any preceding fields sharing 638 * the same 8-byte aligned word must also be marked valid. 639 */ 640 641 /* Handle the area from the start until v_vnodelist... */ 642 off1 = rounddown2(off1, KASAN_SHADOW_SCALE); 643 kasan_mark(mem, off1, off1, KASAN_UMA_FREED); 644 645 /* ... then the area between v_vnodelist and v_dbatchcpu ... */ 646 off1 = roundup2(end1, KASAN_SHADOW_SCALE); 647 off2 = rounddown2(off2, KASAN_SHADOW_SCALE); 648 if (off2 > off1) 649 kasan_mark((void *)((char *)mem + off1), off2 - off1, 650 off2 - off1, KASAN_UMA_FREED); 651 652 /* ... and finally the area from v_dbatchcpu to the end. */ 653 off2 = roundup2(end2, KASAN_SHADOW_SCALE); 654 kasan_mark((void *)((char *)mem + off2), size - off2, size - off2, 655 KASAN_UMA_FREED); 656 } 657 #endif /* KASAN */ 658 659 /* 660 * Initialize a vnode as it first enters the zone. 661 */ 662 static int 663 vnode_init(void *mem, int size, int flags) 664 { 665 struct vnode *vp; 666 667 vp = mem; 668 bzero(vp, size); 669 /* 670 * Setup locks. 671 */ 672 vp->v_vnlock = &vp->v_lock; 673 mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF); 674 /* 675 * By default, don't allow shared locks unless filesystems opt-in. 676 */ 677 lockinit(vp->v_vnlock, PVFS, "vnode", VLKTIMEOUT, 678 LK_NOSHARE | LK_IS_VNODE); 679 /* 680 * Initialize bufobj. 681 */ 682 bufobj_init(&vp->v_bufobj, vp); 683 /* 684 * Initialize namecache. 685 */ 686 cache_vnode_init(vp); 687 /* 688 * Initialize rangelocks. 689 */ 690 rangelock_init(&vp->v_rl); 691 692 vp->v_dbatchcpu = NOCPU; 693 694 vp->v_state = VSTATE_DEAD; 695 696 /* 697 * Check vhold_recycle_free for an explanation. 698 */ 699 vp->v_holdcnt = VHOLD_NO_SMR; 700 vp->v_type = VNON; 701 mtx_lock(&vnode_list_mtx); 702 TAILQ_INSERT_BEFORE(vnode_list_free_marker, vp, v_vnodelist); 703 mtx_unlock(&vnode_list_mtx); 704 return (0); 705 } 706 707 /* 708 * Free a vnode when it is cleared from the zone. 
709 */ 710 static void 711 vnode_fini(void *mem, int size) 712 { 713 struct vnode *vp; 714 struct bufobj *bo; 715 716 vp = mem; 717 vdbatch_dequeue(vp); 718 mtx_lock(&vnode_list_mtx); 719 TAILQ_REMOVE(&vnode_list, vp, v_vnodelist); 720 mtx_unlock(&vnode_list_mtx); 721 rangelock_destroy(&vp->v_rl); 722 lockdestroy(vp->v_vnlock); 723 mtx_destroy(&vp->v_interlock); 724 bo = &vp->v_bufobj; 725 rw_destroy(BO_LOCKPTR(bo)); 726 727 kasan_mark(mem, size, size, 0); 728 } 729 730 /* 731 * Provide the size of NFS nclnode and NFS fh for calculation of the 732 * vnode memory consumption. The size is specified directly to 733 * eliminate dependency on NFS-private header. 734 * 735 * Other filesystems may use bigger or smaller (like UFS and ZFS) 736 * private inode data, but the NFS-based estimation is ample enough. 737 * Still, we care about differences in the size between 64- and 32-bit 738 * platforms. 739 * 740 * Namecache structure size is heuristically 741 * sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1. 742 */ 743 #ifdef _LP64 744 #define NFS_NCLNODE_SZ (528 + 64) 745 #define NC_SZ 148 746 #else 747 #define NFS_NCLNODE_SZ (360 + 32) 748 #define NC_SZ 92 749 #endif 750 751 static void 752 vntblinit(void *dummy __unused) 753 { 754 struct vdbatch *vd; 755 uma_ctor ctor; 756 uma_dtor dtor; 757 int cpu, physvnodes, virtvnodes; 758 759 /* 760 * Desiredvnodes is a function of the physical memory size and the 761 * kernel's heap size. Generally speaking, it scales with the 762 * physical memory size. The ratio of desiredvnodes to the physical 763 * memory size is 1:16 until desiredvnodes exceeds 98,304. 764 * Thereafter, the 765 * marginal ratio of desiredvnodes to the physical memory size is 766 * 1:64. However, desiredvnodes is limited by the kernel's heap 767 * size. The memory required by desiredvnodes vnodes and vm objects 768 * must not exceed 1/10th of the kernel's heap size. 769 */ 770 physvnodes = maxproc + pgtok(vm_cnt.v_page_count) / 64 + 771 3 * min(98304 * 16, pgtok(vm_cnt.v_page_count)) / 64; 772 virtvnodes = vm_kmem_size / (10 * (sizeof(struct vm_object) + 773 sizeof(struct vnode) + NC_SZ * ncsizefactor + NFS_NCLNODE_SZ)); 774 desiredvnodes = min(physvnodes, virtvnodes); 775 if (desiredvnodes > MAXVNODES_MAX) { 776 if (bootverbose) 777 printf("Reducing kern.maxvnodes %lu -> %lu\n", 778 desiredvnodes, MAXVNODES_MAX); 779 desiredvnodes = MAXVNODES_MAX; 780 } 781 wantfreevnodes = desiredvnodes / 4; 782 mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF); 783 TAILQ_INIT(&vnode_list); 784 mtx_init(&vnode_list_mtx, "vnode_list", NULL, MTX_DEF); 785 /* 786 * The lock is taken to appease WITNESS. 787 */ 788 mtx_lock(&vnode_list_mtx); 789 vnlru_recalc(); 790 mtx_unlock(&vnode_list_mtx); 791 vnode_list_free_marker = vn_alloc_marker(NULL); 792 TAILQ_INSERT_HEAD(&vnode_list, vnode_list_free_marker, v_vnodelist); 793 vnode_list_reclaim_marker = vn_alloc_marker(NULL); 794 TAILQ_INSERT_HEAD(&vnode_list, vnode_list_reclaim_marker, v_vnodelist); 795 796 #ifdef KASAN 797 ctor = vnode_ctor; 798 dtor = vnode_dtor; 799 #else 800 ctor = NULL; 801 dtor = NULL; 802 #endif 803 vnode_zone = uma_zcreate("VNODE", sizeof(struct vnode), ctor, dtor, 804 vnode_init, vnode_fini, UMA_ALIGN_PTR, UMA_ZONE_NOKASAN); 805 uma_zone_set_smr(vnode_zone, vfs_smr); 806 807 /* 808 * Preallocate enough nodes to support one-per buf so that 809 * we can not fail an insert. reassignbuf() callers can not 810 * tolerate the insertion failure. 
811 */ 812 buf_trie_zone = uma_zcreate("BUF TRIE", pctrie_node_size(), 813 NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR, 814 UMA_ZONE_NOFREE | UMA_ZONE_SMR); 815 buf_trie_smr = uma_zone_get_smr(buf_trie_zone); 816 uma_prealloc(buf_trie_zone, nbuf); 817 818 vnodes_created = counter_u64_alloc(M_WAITOK); 819 direct_recycles_free_count = counter_u64_alloc(M_WAITOK); 820 vnode_skipped_requeues = counter_u64_alloc(M_WAITOK); 821 822 /* 823 * Initialize the filesystem syncer. 824 */ 825 syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE, 826 &syncer_mask); 827 syncer_maxdelay = syncer_mask + 1; 828 mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF); 829 cv_init(&sync_wakeup, "syncer"); 830 831 CPU_FOREACH(cpu) { 832 vd = DPCPU_ID_PTR((cpu), vd); 833 bzero(vd, sizeof(*vd)); 834 mtx_init(&vd->lock, "vdbatch", NULL, MTX_DEF); 835 } 836 } 837 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL); 838 839 /* 840 * Mark a mount point as busy. Used to synchronize access and to delay 841 * unmounting. Eventually, mountlist_mtx is not released on failure. 842 * 843 * vfs_busy() is a custom lock, it can block the caller. 844 * vfs_busy() only sleeps if the unmount is active on the mount point. 845 * For a mountpoint mp, vfs_busy-enforced lock is before lock of any 846 * vnode belonging to mp. 847 * 848 * Lookup uses vfs_busy() to traverse mount points. 849 * root fs var fs 850 * / vnode lock A / vnode lock (/var) D 851 * /var vnode lock B /log vnode lock(/var/log) E 852 * vfs_busy lock C vfs_busy lock F 853 * 854 * Within each file system, the lock order is C->A->B and F->D->E. 855 * 856 * When traversing across mounts, the system follows that lock order: 857 * 858 * C->A->B 859 * | 860 * +->F->D->E 861 * 862 * The lookup() process for namei("/var") illustrates the process: 863 * 1. VOP_LOOKUP() obtains B while A is held 864 * 2. vfs_busy() obtains a shared lock on F while A and B are held 865 * 3. vput() releases lock on B 866 * 4. vput() releases lock on A 867 * 5. VFS_ROOT() obtains lock on D while shared lock on F is held 868 * 6. vfs_unbusy() releases shared lock on F 869 * 7. vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A. 870 * Attempt to lock A (instead of vp_crossmp) while D is held would 871 * violate the global order, causing deadlocks. 872 * 873 * dounmount() locks B while F is drained. Note that for stacked 874 * filesystems, D and B in the example above may be the same lock, 875 * which introdues potential lock order reversal deadlock between 876 * dounmount() and step 5 above. These filesystems may avoid the LOR 877 * by setting VV_CROSSLOCK on the covered vnode so that lock B will 878 * remain held until after step 5. 879 */ 880 int 881 vfs_busy(struct mount *mp, int flags) 882 { 883 struct mount_pcpu *mpcpu; 884 885 MPASS((flags & ~MBF_MASK) == 0); 886 CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags); 887 888 if (vfs_op_thread_enter(mp, mpcpu)) { 889 MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0); 890 MPASS((mp->mnt_kern_flag & MNTK_UNMOUNT) == 0); 891 MPASS((mp->mnt_kern_flag & MNTK_REFEXPIRE) == 0); 892 vfs_mp_count_add_pcpu(mpcpu, ref, 1); 893 vfs_mp_count_add_pcpu(mpcpu, lockref, 1); 894 vfs_op_thread_exit(mp, mpcpu); 895 if (flags & MBF_MNTLSTLOCK) 896 mtx_unlock(&mountlist_mtx); 897 return (0); 898 } 899 900 MNT_ILOCK(mp); 901 vfs_assert_mount_counters(mp); 902 MNT_REF(mp); 903 /* 904 * If mount point is currently being unmounted, sleep until the 905 * mount point fate is decided. 
If thread doing the unmounting fails, 906 * it will clear MNTK_UNMOUNT flag before waking us up, indicating 907 * that this mount point has survived the unmount attempt and vfs_busy 908 * should retry. Otherwise the unmounter thread will set MNTK_REFEXPIRE 909 * flag in addition to MNTK_UNMOUNT, indicating that mount point is 910 * about to be really destroyed. vfs_busy needs to release its 911 * reference on the mount point in this case and return with ENOENT, 912 * telling the caller the mount it tried to busy is no longer valid. 913 */ 914 while (mp->mnt_kern_flag & MNTK_UNMOUNT) { 915 KASSERT(TAILQ_EMPTY(&mp->mnt_uppers), 916 ("%s: non-empty upper mount list with pending unmount", 917 __func__)); 918 if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) { 919 MNT_REL(mp); 920 MNT_IUNLOCK(mp); 921 CTR1(KTR_VFS, "%s: failed busying before sleeping", 922 __func__); 923 return (ENOENT); 924 } 925 if (flags & MBF_MNTLSTLOCK) 926 mtx_unlock(&mountlist_mtx); 927 mp->mnt_kern_flag |= MNTK_MWAIT; 928 msleep(mp, MNT_MTX(mp), PVFS | PDROP, "vfs_busy", 0); 929 if (flags & MBF_MNTLSTLOCK) 930 mtx_lock(&mountlist_mtx); 931 MNT_ILOCK(mp); 932 } 933 if (flags & MBF_MNTLSTLOCK) 934 mtx_unlock(&mountlist_mtx); 935 mp->mnt_lockref++; 936 MNT_IUNLOCK(mp); 937 return (0); 938 } 939 940 /* 941 * Free a busy filesystem. 942 */ 943 void 944 vfs_unbusy(struct mount *mp) 945 { 946 struct mount_pcpu *mpcpu; 947 int c; 948 949 CTR2(KTR_VFS, "%s: mp %p", __func__, mp); 950 951 if (vfs_op_thread_enter(mp, mpcpu)) { 952 MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0); 953 vfs_mp_count_sub_pcpu(mpcpu, lockref, 1); 954 vfs_mp_count_sub_pcpu(mpcpu, ref, 1); 955 vfs_op_thread_exit(mp, mpcpu); 956 return; 957 } 958 959 MNT_ILOCK(mp); 960 vfs_assert_mount_counters(mp); 961 MNT_REL(mp); 962 c = --mp->mnt_lockref; 963 if (mp->mnt_vfs_ops == 0) { 964 MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0); 965 MNT_IUNLOCK(mp); 966 return; 967 } 968 if (c < 0) 969 vfs_dump_mount_counters(mp); 970 if (c == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) { 971 MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT); 972 CTR1(KTR_VFS, "%s: waking up waiters", __func__); 973 mp->mnt_kern_flag &= ~MNTK_DRAINING; 974 wakeup(&mp->mnt_lockref); 975 } 976 MNT_IUNLOCK(mp); 977 } 978 979 /* 980 * Lookup a mount point by filesystem identifier. 981 */ 982 struct mount * 983 vfs_getvfs(fsid_t *fsid) 984 { 985 struct mount *mp; 986 987 CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid); 988 mtx_lock(&mountlist_mtx); 989 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 990 if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) { 991 vfs_ref(mp); 992 mtx_unlock(&mountlist_mtx); 993 return (mp); 994 } 995 } 996 mtx_unlock(&mountlist_mtx); 997 CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid); 998 return ((struct mount *) 0); 999 } 1000 1001 /* 1002 * Lookup a mount point by filesystem identifier, busying it before 1003 * returning. 1004 * 1005 * To avoid congestion on mountlist_mtx, implement simple direct-mapped 1006 * cache for popular filesystem identifiers. The cache is lockess, using 1007 * the fact that struct mount's are never freed. In worst case we may 1008 * get pointer to unmounted or even different filesystem, so we have to 1009 * check what we got, and go slow way if so. 
1010 */ 1011 struct mount * 1012 vfs_busyfs(fsid_t *fsid) 1013 { 1014 #define FSID_CACHE_SIZE 256 1015 typedef struct mount * volatile vmp_t; 1016 static vmp_t cache[FSID_CACHE_SIZE]; 1017 struct mount *mp; 1018 int error; 1019 uint32_t hash; 1020 1021 CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid); 1022 hash = fsid->val[0] ^ fsid->val[1]; 1023 hash = (hash >> 16 ^ hash) & (FSID_CACHE_SIZE - 1); 1024 mp = cache[hash]; 1025 if (mp == NULL || fsidcmp(&mp->mnt_stat.f_fsid, fsid) != 0) 1026 goto slow; 1027 if (vfs_busy(mp, 0) != 0) { 1028 cache[hash] = NULL; 1029 goto slow; 1030 } 1031 if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) 1032 return (mp); 1033 else 1034 vfs_unbusy(mp); 1035 1036 slow: 1037 mtx_lock(&mountlist_mtx); 1038 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 1039 if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) { 1040 error = vfs_busy(mp, MBF_MNTLSTLOCK); 1041 if (error) { 1042 cache[hash] = NULL; 1043 mtx_unlock(&mountlist_mtx); 1044 return (NULL); 1045 } 1046 cache[hash] = mp; 1047 return (mp); 1048 } 1049 } 1050 CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid); 1051 mtx_unlock(&mountlist_mtx); 1052 return ((struct mount *) 0); 1053 } 1054 1055 /* 1056 * Check if a user can access privileged mount options. 1057 */ 1058 int 1059 vfs_suser(struct mount *mp, struct thread *td) 1060 { 1061 int error; 1062 1063 if (jailed(td->td_ucred)) { 1064 /* 1065 * If the jail of the calling thread lacks permission for 1066 * this type of file system, deny immediately. 1067 */ 1068 if (!prison_allow(td->td_ucred, mp->mnt_vfc->vfc_prison_flag)) 1069 return (EPERM); 1070 1071 /* 1072 * If the file system was mounted outside the jail of the 1073 * calling thread, deny immediately. 1074 */ 1075 if (prison_check(td->td_ucred, mp->mnt_cred) != 0) 1076 return (EPERM); 1077 } 1078 1079 /* 1080 * If file system supports delegated administration, we don't check 1081 * for the PRIV_VFS_MOUNT_OWNER privilege - it will be better verified 1082 * by the file system itself. 1083 * If this is not the user that did original mount, we check for 1084 * the PRIV_VFS_MOUNT_OWNER privilege. 1085 */ 1086 if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) && 1087 mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) { 1088 if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0) 1089 return (error); 1090 } 1091 return (0); 1092 } 1093 1094 /* 1095 * Get a new unique fsid. Try to make its val[0] unique, since this value 1096 * will be used to create fake device numbers for stat(). Also try (but 1097 * not so hard) make its val[0] unique mod 2^16, since some emulators only 1098 * support 16-bit device numbers. We end up with unique val[0]'s for the 1099 * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls. 1100 * 1101 * Keep in mind that several mounts may be running in parallel. Starting 1102 * the search one past where the previous search terminated is both a 1103 * micro-optimization and a defense against returning the same fsid to 1104 * different mounts. 
1105 */ 1106 void 1107 vfs_getnewfsid(struct mount *mp) 1108 { 1109 static uint16_t mntid_base; 1110 struct mount *nmp; 1111 fsid_t tfsid; 1112 int mtype; 1113 1114 CTR2(KTR_VFS, "%s: mp %p", __func__, mp); 1115 mtx_lock(&mntid_mtx); 1116 mtype = mp->mnt_vfc->vfc_typenum; 1117 tfsid.val[1] = mtype; 1118 mtype = (mtype & 0xFF) << 24; 1119 for (;;) { 1120 tfsid.val[0] = makedev(255, 1121 mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF)); 1122 mntid_base++; 1123 if ((nmp = vfs_getvfs(&tfsid)) == NULL) 1124 break; 1125 vfs_rel(nmp); 1126 } 1127 mp->mnt_stat.f_fsid.val[0] = tfsid.val[0]; 1128 mp->mnt_stat.f_fsid.val[1] = tfsid.val[1]; 1129 mtx_unlock(&mntid_mtx); 1130 } 1131 1132 /* 1133 * Knob to control the precision of file timestamps: 1134 * 1135 * 0 = seconds only; nanoseconds zeroed. 1136 * 1 = seconds and nanoseconds, accurate within 1/HZ. 1137 * 2 = seconds and nanoseconds, truncated to microseconds. 1138 * >=3 = seconds and nanoseconds, maximum precision. 1139 */ 1140 enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC }; 1141 1142 static int timestamp_precision = TSP_USEC; 1143 SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW, 1144 ×tamp_precision, 0, "File timestamp precision (0: seconds, " 1145 "1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to us, " 1146 "3+: sec + ns (max. precision))"); 1147 1148 /* 1149 * Get a current timestamp. 1150 */ 1151 void 1152 vfs_timestamp(struct timespec *tsp) 1153 { 1154 struct timeval tv; 1155 1156 switch (timestamp_precision) { 1157 case TSP_SEC: 1158 tsp->tv_sec = time_second; 1159 tsp->tv_nsec = 0; 1160 break; 1161 case TSP_HZ: 1162 getnanotime(tsp); 1163 break; 1164 case TSP_USEC: 1165 microtime(&tv); 1166 TIMEVAL_TO_TIMESPEC(&tv, tsp); 1167 break; 1168 case TSP_NSEC: 1169 default: 1170 nanotime(tsp); 1171 break; 1172 } 1173 } 1174 1175 /* 1176 * Set vnode attributes to VNOVAL 1177 */ 1178 void 1179 vattr_null(struct vattr *vap) 1180 { 1181 1182 vap->va_type = VNON; 1183 vap->va_size = VNOVAL; 1184 vap->va_bytes = VNOVAL; 1185 vap->va_mode = VNOVAL; 1186 vap->va_nlink = VNOVAL; 1187 vap->va_uid = VNOVAL; 1188 vap->va_gid = VNOVAL; 1189 vap->va_fsid = VNOVAL; 1190 vap->va_fileid = VNOVAL; 1191 vap->va_blocksize = VNOVAL; 1192 vap->va_rdev = VNOVAL; 1193 vap->va_atime.tv_sec = VNOVAL; 1194 vap->va_atime.tv_nsec = VNOVAL; 1195 vap->va_mtime.tv_sec = VNOVAL; 1196 vap->va_mtime.tv_nsec = VNOVAL; 1197 vap->va_ctime.tv_sec = VNOVAL; 1198 vap->va_ctime.tv_nsec = VNOVAL; 1199 vap->va_birthtime.tv_sec = VNOVAL; 1200 vap->va_birthtime.tv_nsec = VNOVAL; 1201 vap->va_flags = VNOVAL; 1202 vap->va_gen = VNOVAL; 1203 vap->va_vaflags = 0; 1204 vap->va_filerev = VNOVAL; 1205 vap->va_bsdflags = 0; 1206 } 1207 1208 /* 1209 * Try to reduce the total number of vnodes. 1210 * 1211 * This routine (and its user) are buggy in at least the following ways: 1212 * - all parameters were picked years ago when RAM sizes were significantly 1213 * smaller 1214 * - it can pick vnodes based on pages used by the vm object, but filesystems 1215 * like ZFS don't use it making the pick broken 1216 * - since ZFS has its own aging policy it gets partially combated by this one 1217 * - a dedicated method should be provided for filesystems to let them decide 1218 * whether the vnode should be recycled 1219 * 1220 * This routine is called when we have too many vnodes. It attempts 1221 * to free <count> vnodes and will potentially free vnodes that still 1222 * have VM backing store (VM backing store is typically the cause 1223 * of a vnode blowout so we want to do this). 
Therefore, this operation 1224 * is not considered cheap. 1225 * 1226 * A number of conditions may prevent a vnode from being reclaimed. 1227 * the buffer cache may have references on the vnode, a directory 1228 * vnode may still have references due to the namei cache representing 1229 * underlying files, or the vnode may be in active use. It is not 1230 * desirable to reuse such vnodes. These conditions may cause the 1231 * number of vnodes to reach some minimum value regardless of what 1232 * you set kern.maxvnodes to. Do not set kern.maxvnodes too low. 1233 * 1234 * @param reclaim_nc_src Only reclaim directories with outgoing namecache 1235 * entries if this argument is strue 1236 * @param trigger Only reclaim vnodes with fewer than this many resident 1237 * pages. 1238 * @param target How many vnodes to reclaim. 1239 * @return The number of vnodes that were reclaimed. 1240 */ 1241 static int 1242 vlrureclaim(bool reclaim_nc_src, int trigger, u_long target) 1243 { 1244 struct vnode *vp, *mvp; 1245 struct mount *mp; 1246 struct vm_object *object; 1247 u_long done; 1248 bool retried; 1249 1250 mtx_assert(&vnode_list_mtx, MA_OWNED); 1251 1252 retried = false; 1253 done = 0; 1254 1255 mvp = vnode_list_reclaim_marker; 1256 restart: 1257 vp = mvp; 1258 while (done < target) { 1259 vp = TAILQ_NEXT(vp, v_vnodelist); 1260 if (__predict_false(vp == NULL)) 1261 break; 1262 1263 if (__predict_false(vp->v_type == VMARKER)) 1264 continue; 1265 1266 /* 1267 * If it's been deconstructed already, it's still 1268 * referenced, or it exceeds the trigger, skip it. 1269 * Also skip free vnodes. We are trying to make space 1270 * for more free vnodes, not reduce their count. 1271 */ 1272 if (vp->v_usecount > 0 || vp->v_holdcnt == 0 || 1273 (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src))) 1274 goto next_iter; 1275 1276 if (vp->v_type == VBAD || vp->v_type == VNON) 1277 goto next_iter; 1278 1279 object = atomic_load_ptr(&vp->v_object); 1280 if (object == NULL || object->resident_page_count > trigger) { 1281 goto next_iter; 1282 } 1283 1284 /* 1285 * Handle races against vnode allocation. Filesystems lock the 1286 * vnode some time after it gets returned from getnewvnode, 1287 * despite type and hold count being manipulated earlier. 1288 * Resorting to checking v_mount restores guarantees present 1289 * before the global list was reworked to contain all vnodes. 
1290 */ 1291 if (!VI_TRYLOCK(vp)) 1292 goto next_iter; 1293 if (__predict_false(vp->v_type == VBAD || vp->v_type == VNON)) { 1294 VI_UNLOCK(vp); 1295 goto next_iter; 1296 } 1297 if (vp->v_mount == NULL) { 1298 VI_UNLOCK(vp); 1299 goto next_iter; 1300 } 1301 vholdl(vp); 1302 VI_UNLOCK(vp); 1303 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); 1304 TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist); 1305 mtx_unlock(&vnode_list_mtx); 1306 1307 if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { 1308 vdrop_recycle(vp); 1309 goto next_iter_unlocked; 1310 } 1311 if (VOP_LOCK(vp, LK_EXCLUSIVE|LK_NOWAIT) != 0) { 1312 vdrop_recycle(vp); 1313 vn_finished_write(mp); 1314 goto next_iter_unlocked; 1315 } 1316 1317 VI_LOCK(vp); 1318 if (vp->v_usecount > 0 || 1319 (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) || 1320 (vp->v_object != NULL && vp->v_object->handle == vp && 1321 vp->v_object->resident_page_count > trigger)) { 1322 VOP_UNLOCK(vp); 1323 vdropl_recycle(vp); 1324 vn_finished_write(mp); 1325 goto next_iter_unlocked; 1326 } 1327 recycles_count++; 1328 vgonel(vp); 1329 VOP_UNLOCK(vp); 1330 vdropl_recycle(vp); 1331 vn_finished_write(mp); 1332 done++; 1333 next_iter_unlocked: 1334 maybe_yield(); 1335 mtx_lock(&vnode_list_mtx); 1336 goto restart; 1337 next_iter: 1338 MPASS(vp->v_type != VMARKER); 1339 if (!should_yield()) 1340 continue; 1341 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); 1342 TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist); 1343 mtx_unlock(&vnode_list_mtx); 1344 kern_yield(PRI_USER); 1345 mtx_lock(&vnode_list_mtx); 1346 goto restart; 1347 } 1348 if (done == 0 && !retried) { 1349 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); 1350 TAILQ_INSERT_HEAD(&vnode_list, mvp, v_vnodelist); 1351 retried = true; 1352 goto restart; 1353 } 1354 return (done); 1355 } 1356 1357 static int max_free_per_call = 10000; 1358 SYSCTL_INT(_debug, OID_AUTO, max_vnlru_free, CTLFLAG_RW, &max_free_per_call, 0, 1359 "limit on vnode free requests per call to the vnlru_free routine (legacy)"); 1360 SYSCTL_INT(_vfs_vnode_vnlru, OID_AUTO, max_free_per_call, CTLFLAG_RW, 1361 &max_free_per_call, 0, 1362 "limit on vnode free requests per call to the vnlru_free routine"); 1363 1364 /* 1365 * Attempt to recycle requested amount of free vnodes. 1366 */ 1367 static int 1368 vnlru_free_impl(int count, struct vfsops *mnt_op, struct vnode *mvp, bool isvnlru) 1369 { 1370 struct vnode *vp; 1371 struct mount *mp; 1372 int ocount; 1373 bool retried; 1374 1375 mtx_assert(&vnode_list_mtx, MA_OWNED); 1376 if (count > max_free_per_call) 1377 count = max_free_per_call; 1378 if (count == 0) { 1379 mtx_unlock(&vnode_list_mtx); 1380 return (0); 1381 } 1382 ocount = count; 1383 retried = false; 1384 vp = mvp; 1385 for (;;) { 1386 vp = TAILQ_NEXT(vp, v_vnodelist); 1387 if (__predict_false(vp == NULL)) { 1388 /* 1389 * The free vnode marker can be past eligible vnodes: 1390 * 1. if vdbatch_process trylock failed 1391 * 2. if vtryrecycle failed 1392 * 1393 * If so, start the scan from scratch. 
1394 */ 1395 if (!retried && vnlru_read_freevnodes() > 0) { 1396 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); 1397 TAILQ_INSERT_HEAD(&vnode_list, mvp, v_vnodelist); 1398 vp = mvp; 1399 retried = true; 1400 continue; 1401 } 1402 1403 /* 1404 * Give up 1405 */ 1406 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); 1407 TAILQ_INSERT_TAIL(&vnode_list, mvp, v_vnodelist); 1408 mtx_unlock(&vnode_list_mtx); 1409 break; 1410 } 1411 if (__predict_false(vp->v_type == VMARKER)) 1412 continue; 1413 if (vp->v_holdcnt > 0) 1414 continue; 1415 /* 1416 * Don't recycle if our vnode is from different type 1417 * of mount point. Note that mp is type-safe, the 1418 * check does not reach unmapped address even if 1419 * vnode is reclaimed. 1420 */ 1421 if (mnt_op != NULL && (mp = vp->v_mount) != NULL && 1422 mp->mnt_op != mnt_op) { 1423 continue; 1424 } 1425 if (__predict_false(vp->v_type == VBAD || vp->v_type == VNON)) { 1426 continue; 1427 } 1428 if (!vhold_recycle_free(vp)) 1429 continue; 1430 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); 1431 TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist); 1432 mtx_unlock(&vnode_list_mtx); 1433 /* 1434 * FIXME: ignores the return value, meaning it may be nothing 1435 * got recycled but it claims otherwise to the caller. 1436 * 1437 * Originally the value started being ignored in 2005 with 1438 * 114a1006a8204aa156e1f9ad6476cdff89cada7f . 1439 * 1440 * Respecting the value can run into significant stalls if most 1441 * vnodes belong to one file system and it has writes 1442 * suspended. In presence of many threads and millions of 1443 * vnodes they keep contending on the vnode_list_mtx lock only 1444 * to find vnodes they can't recycle. 1445 * 1446 * The solution would be to pre-check if the vnode is likely to 1447 * be recycle-able, but it needs to happen with the 1448 * vnode_list_mtx lock held. This runs into a problem where 1449 * VOP_GETWRITEMOUNT (currently needed to find out about if 1450 * writes are frozen) can take locks which LOR against it. 1451 * 1452 * Check nullfs for one example (null_getwritemount). 1453 */ 1454 vtryrecycle(vp, isvnlru); 1455 count--; 1456 if (count == 0) { 1457 break; 1458 } 1459 mtx_lock(&vnode_list_mtx); 1460 vp = mvp; 1461 } 1462 mtx_assert(&vnode_list_mtx, MA_NOTOWNED); 1463 return (ocount - count); 1464 } 1465 1466 /* 1467 * XXX: returns without vnode_list_mtx locked! 
1468 */ 1469 static int 1470 vnlru_free_locked_direct(int count) 1471 { 1472 int ret; 1473 1474 mtx_assert(&vnode_list_mtx, MA_OWNED); 1475 ret = vnlru_free_impl(count, NULL, vnode_list_free_marker, false); 1476 mtx_assert(&vnode_list_mtx, MA_NOTOWNED); 1477 return (ret); 1478 } 1479 1480 static int 1481 vnlru_free_locked_vnlru(int count) 1482 { 1483 int ret; 1484 1485 mtx_assert(&vnode_list_mtx, MA_OWNED); 1486 ret = vnlru_free_impl(count, NULL, vnode_list_free_marker, true); 1487 mtx_assert(&vnode_list_mtx, MA_NOTOWNED); 1488 return (ret); 1489 } 1490 1491 static int 1492 vnlru_free_vnlru(int count) 1493 { 1494 1495 mtx_lock(&vnode_list_mtx); 1496 return (vnlru_free_locked_vnlru(count)); 1497 } 1498 1499 void 1500 vnlru_free_vfsops(int count, struct vfsops *mnt_op, struct vnode *mvp) 1501 { 1502 1503 MPASS(mnt_op != NULL); 1504 MPASS(mvp != NULL); 1505 VNPASS(mvp->v_type == VMARKER, mvp); 1506 mtx_lock(&vnode_list_mtx); 1507 vnlru_free_impl(count, mnt_op, mvp, true); 1508 mtx_assert(&vnode_list_mtx, MA_NOTOWNED); 1509 } 1510 1511 struct vnode * 1512 vnlru_alloc_marker(void) 1513 { 1514 struct vnode *mvp; 1515 1516 mvp = vn_alloc_marker(NULL); 1517 mtx_lock(&vnode_list_mtx); 1518 TAILQ_INSERT_BEFORE(vnode_list_free_marker, mvp, v_vnodelist); 1519 mtx_unlock(&vnode_list_mtx); 1520 return (mvp); 1521 } 1522 1523 void 1524 vnlru_free_marker(struct vnode *mvp) 1525 { 1526 mtx_lock(&vnode_list_mtx); 1527 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); 1528 mtx_unlock(&vnode_list_mtx); 1529 vn_free_marker(mvp); 1530 } 1531 1532 static void 1533 vnlru_recalc(void) 1534 { 1535 1536 mtx_assert(&vnode_list_mtx, MA_OWNED); 1537 gapvnodes = imax(desiredvnodes - wantfreevnodes, 100); 1538 vhiwat = gapvnodes / 11; /* 9% -- just under the 10% in vlrureclaim() */ 1539 vlowat = vhiwat / 2; 1540 } 1541 1542 /* 1543 * Attempt to recycle vnodes in a context that is always safe to block. 1544 * Calling vlrurecycle() from the bowels of filesystem code has some 1545 * interesting deadlock problems. 1546 */ 1547 static struct proc *vnlruproc; 1548 static int vnlruproc_sig; 1549 static u_long vnlruproc_kicks; 1550 1551 SYSCTL_ULONG(_vfs_vnode_vnlru, OID_AUTO, kicks, CTLFLAG_RD, &vnlruproc_kicks, 0, 1552 "Number of times vnlru awakened due to vnode shortage"); 1553 1554 #define VNLRU_COUNT_SLOP 100 1555 1556 /* 1557 * The main freevnodes counter is only updated when a counter local to CPU 1558 * diverges from 0 by more than VNLRU_FREEVNODES_SLOP. CPUs are conditionally 1559 * walked to compute a more accurate total. 1560 * 1561 * Note: the actual value at any given moment can still exceed slop, but it 1562 * should not be by significant margin in practice. 
1563 */ 1564 #define VNLRU_FREEVNODES_SLOP 126 1565 1566 static void __noinline 1567 vfs_freevnodes_rollup(int8_t *lfreevnodes) 1568 { 1569 1570 atomic_add_long(&freevnodes, *lfreevnodes); 1571 *lfreevnodes = 0; 1572 critical_exit(); 1573 } 1574 1575 static __inline void 1576 vfs_freevnodes_inc(void) 1577 { 1578 int8_t *lfreevnodes; 1579 1580 critical_enter(); 1581 lfreevnodes = PCPU_PTR(vfs_freevnodes); 1582 (*lfreevnodes)++; 1583 if (__predict_false(*lfreevnodes == VNLRU_FREEVNODES_SLOP)) 1584 vfs_freevnodes_rollup(lfreevnodes); 1585 else 1586 critical_exit(); 1587 } 1588 1589 static __inline void 1590 vfs_freevnodes_dec(void) 1591 { 1592 int8_t *lfreevnodes; 1593 1594 critical_enter(); 1595 lfreevnodes = PCPU_PTR(vfs_freevnodes); 1596 (*lfreevnodes)--; 1597 if (__predict_false(*lfreevnodes == -VNLRU_FREEVNODES_SLOP)) 1598 vfs_freevnodes_rollup(lfreevnodes); 1599 else 1600 critical_exit(); 1601 } 1602 1603 static u_long 1604 vnlru_read_freevnodes(void) 1605 { 1606 long slop, rfreevnodes, rfreevnodes_old; 1607 int cpu; 1608 1609 rfreevnodes = atomic_load_long(&freevnodes); 1610 rfreevnodes_old = atomic_load_long(&freevnodes_old); 1611 1612 if (rfreevnodes > rfreevnodes_old) 1613 slop = rfreevnodes - rfreevnodes_old; 1614 else 1615 slop = rfreevnodes_old - rfreevnodes; 1616 if (slop < VNLRU_FREEVNODES_SLOP) 1617 return (rfreevnodes >= 0 ? rfreevnodes : 0); 1618 CPU_FOREACH(cpu) { 1619 rfreevnodes += cpuid_to_pcpu[cpu]->pc_vfs_freevnodes; 1620 } 1621 atomic_store_long(&freevnodes_old, rfreevnodes); 1622 return (freevnodes_old >= 0 ? freevnodes_old : 0); 1623 } 1624 1625 static bool 1626 vnlru_under(u_long rnumvnodes, u_long limit) 1627 { 1628 u_long rfreevnodes, space; 1629 1630 if (__predict_false(rnumvnodes > desiredvnodes)) 1631 return (true); 1632 1633 space = desiredvnodes - rnumvnodes; 1634 if (space < limit) { 1635 rfreevnodes = vnlru_read_freevnodes(); 1636 if (rfreevnodes > wantfreevnodes) 1637 space += rfreevnodes - wantfreevnodes; 1638 } 1639 return (space < limit); 1640 } 1641 1642 static void 1643 vnlru_kick_locked(void) 1644 { 1645 1646 mtx_assert(&vnode_list_mtx, MA_OWNED); 1647 if (vnlruproc_sig == 0) { 1648 vnlruproc_sig = 1; 1649 vnlruproc_kicks++; 1650 wakeup(vnlruproc); 1651 } 1652 } 1653 1654 static void 1655 vnlru_kick_cond(void) 1656 { 1657 1658 if (vnlru_read_freevnodes() > wantfreevnodes) 1659 return; 1660 1661 if (vnlruproc_sig) 1662 return; 1663 mtx_lock(&vnode_list_mtx); 1664 vnlru_kick_locked(); 1665 mtx_unlock(&vnode_list_mtx); 1666 } 1667 1668 static void 1669 vnlru_proc_sleep(void) 1670 { 1671 1672 if (vnlruproc_sig) { 1673 vnlruproc_sig = 0; 1674 wakeup(&vnlruproc_sig); 1675 } 1676 msleep(vnlruproc, &vnode_list_mtx, PVFS|PDROP, "vlruwt", hz); 1677 } 1678 1679 /* 1680 * A lighter version of the machinery below. 1681 * 1682 * Tries to reach goals only by recycling free vnodes and does not invoke 1683 * uma_reclaim(UMA_RECLAIM_DRAIN). 1684 * 1685 * This works around pathological behavior in vnlru in presence of tons of free 1686 * vnodes, but without having to rewrite the machinery at this time. Said 1687 * behavior boils down to continuously trying to reclaim all kinds of vnodes 1688 * (cycling through all levels of "force") when the count is transiently above 1689 * limit. This happens a lot when all vnodes are used up and vn_alloc 1690 * speculatively increments the counter. 
1691 * 1692 * Sample testcase: vnode limit 8388608, 20 separate directory trees each with 1693 * 1 million files in total and 20 find(1) processes stating them in parallel 1694 * (one per each tree). 1695 * 1696 * On a kernel with only stock machinery this needs anywhere between 60 and 120 1697 * seconds to execute (time varies *wildly* between runs). With the workaround 1698 * it consistently stays around 20 seconds [it got further down with later 1699 * changes]. 1700 * 1701 * That is to say the entire thing needs a fundamental redesign (most notably 1702 * to accommodate faster recycling), the above only tries to get it ouf the way. 1703 * 1704 * Return values are: 1705 * -1 -- fallback to regular vnlru loop 1706 * 0 -- do nothing, go to sleep 1707 * >0 -- recycle this many vnodes 1708 */ 1709 static long 1710 vnlru_proc_light_pick(void) 1711 { 1712 u_long rnumvnodes, rfreevnodes; 1713 1714 if (vstir || vnlruproc_sig == 1) 1715 return (-1); 1716 1717 rnumvnodes = atomic_load_long(&numvnodes); 1718 rfreevnodes = vnlru_read_freevnodes(); 1719 1720 /* 1721 * vnode limit might have changed and now we may be at a significant 1722 * excess. Bail if we can't sort it out with free vnodes. 1723 * 1724 * Due to atomic updates the count can legitimately go above 1725 * the limit for a short period, don't bother doing anything in 1726 * that case. 1727 */ 1728 if (rnumvnodes > desiredvnodes + VNLRU_COUNT_SLOP + 10) { 1729 if (rnumvnodes - rfreevnodes >= desiredvnodes || 1730 rfreevnodes <= wantfreevnodes) { 1731 return (-1); 1732 } 1733 1734 return (rnumvnodes - desiredvnodes); 1735 } 1736 1737 /* 1738 * Don't try to reach wantfreevnodes target if there are too few vnodes 1739 * to begin with. 1740 */ 1741 if (rnumvnodes < wantfreevnodes) { 1742 return (0); 1743 } 1744 1745 if (rfreevnodes < wantfreevnodes) { 1746 return (-1); 1747 } 1748 1749 return (0); 1750 } 1751 1752 static bool 1753 vnlru_proc_light(void) 1754 { 1755 long freecount; 1756 1757 mtx_assert(&vnode_list_mtx, MA_NOTOWNED); 1758 1759 freecount = vnlru_proc_light_pick(); 1760 if (freecount == -1) 1761 return (false); 1762 1763 if (freecount != 0) { 1764 vnlru_free_vnlru(freecount); 1765 } 1766 1767 mtx_lock(&vnode_list_mtx); 1768 vnlru_proc_sleep(); 1769 mtx_assert(&vnode_list_mtx, MA_NOTOWNED); 1770 return (true); 1771 } 1772 1773 static u_long uma_reclaim_calls; 1774 SYSCTL_ULONG(_vfs_vnode_vnlru, OID_AUTO, uma_reclaim_calls, CTLFLAG_RD | CTLFLAG_STATS, 1775 &uma_reclaim_calls, 0, "Number of calls to uma_reclaim"); 1776 1777 static void 1778 vnlru_proc(void) 1779 { 1780 u_long rnumvnodes, rfreevnodes, target; 1781 unsigned long onumvnodes; 1782 int done, force, trigger, usevnodes; 1783 bool reclaim_nc_src, want_reread; 1784 1785 EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, vnlruproc, 1786 SHUTDOWN_PRI_FIRST); 1787 1788 force = 0; 1789 want_reread = false; 1790 for (;;) { 1791 kproc_suspend_check(vnlruproc); 1792 1793 if (force == 0 && vnlru_proc_light()) 1794 continue; 1795 1796 mtx_lock(&vnode_list_mtx); 1797 rnumvnodes = atomic_load_long(&numvnodes); 1798 1799 if (want_reread) { 1800 force = vnlru_under(numvnodes, vhiwat) ? 1 : 0; 1801 want_reread = false; 1802 } 1803 1804 /* 1805 * If numvnodes is too large (due to desiredvnodes being 1806 * adjusted using its sysctl, or emergency growth), first 1807 * try to reduce it by discarding free vnodes. 
1808 */ 1809 if (rnumvnodes > desiredvnodes + 10) { 1810 vnlru_free_locked_vnlru(rnumvnodes - desiredvnodes); 1811 mtx_lock(&vnode_list_mtx); 1812 rnumvnodes = atomic_load_long(&numvnodes); 1813 } 1814 /* 1815 * Sleep if the vnode cache is in a good state. This is 1816 * when it is not over-full and has space for about a 4% 1817 * or 9% expansion (by growing its size or inexcessively 1818 * reducing free vnode count). Otherwise, try to reclaim 1819 * space for a 10% expansion. 1820 */ 1821 if (vstir && force == 0) { 1822 force = 1; 1823 vstir = false; 1824 } 1825 if (force == 0 && !vnlru_under(rnumvnodes, vlowat)) { 1826 vnlru_proc_sleep(); 1827 continue; 1828 } 1829 rfreevnodes = vnlru_read_freevnodes(); 1830 1831 onumvnodes = rnumvnodes; 1832 /* 1833 * Calculate parameters for recycling. These are the same 1834 * throughout the loop to give some semblance of fairness. 1835 * The trigger point is to avoid recycling vnodes with lots 1836 * of resident pages. We aren't trying to free memory; we 1837 * are trying to recycle or at least free vnodes. 1838 */ 1839 if (rnumvnodes <= desiredvnodes) 1840 usevnodes = rnumvnodes - rfreevnodes; 1841 else 1842 usevnodes = rnumvnodes; 1843 if (usevnodes <= 0) 1844 usevnodes = 1; 1845 /* 1846 * The trigger value is chosen to give a conservatively 1847 * large value to ensure that it alone doesn't prevent 1848 * making progress. The value can easily be so large that 1849 * it is effectively infinite in some congested and 1850 * misconfigured cases, and this is necessary. Normally 1851 * it is about 8 to 100 (pages), which is quite large. 1852 */ 1853 trigger = vm_cnt.v_page_count * 2 / usevnodes; 1854 if (force < 2) 1855 trigger = vsmalltrigger; 1856 reclaim_nc_src = force >= 3; 1857 target = rnumvnodes * (int64_t)gapvnodes / imax(desiredvnodes, 1); 1858 target = target / 10 + 1; 1859 done = vlrureclaim(reclaim_nc_src, trigger, target); 1860 mtx_unlock(&vnode_list_mtx); 1861 /* 1862 * Total number of vnodes can transiently go slightly above the 1863 * limit (see vn_alloc_hard), no need to call uma_reclaim if 1864 * this happens. 1865 */ 1866 if (onumvnodes + VNLRU_COUNT_SLOP + 1000 > desiredvnodes && 1867 numvnodes <= desiredvnodes) { 1868 uma_reclaim_calls++; 1869 uma_reclaim(UMA_RECLAIM_DRAIN); 1870 } 1871 if (done == 0) { 1872 if (force == 0 || force == 1) { 1873 force = 2; 1874 continue; 1875 } 1876 if (force == 2) { 1877 force = 3; 1878 continue; 1879 } 1880 want_reread = true; 1881 force = 0; 1882 vnlru_nowhere++; 1883 tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3); 1884 } else { 1885 want_reread = true; 1886 kern_yield(PRI_USER); 1887 } 1888 } 1889 } 1890 1891 static struct kproc_desc vnlru_kp = { 1892 "vnlru", 1893 vnlru_proc, 1894 &vnlruproc 1895 }; 1896 SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, 1897 &vnlru_kp); 1898 1899 /* 1900 * Routines having to do with the management of the vnode table. 1901 */ 1902 1903 /* 1904 * Try to recycle a freed vnode. 1905 */ 1906 static int 1907 vtryrecycle(struct vnode *vp, bool isvnlru) 1908 { 1909 struct mount *vnmp; 1910 1911 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 1912 VNPASS(vp->v_holdcnt > 0, vp); 1913 /* 1914 * This vnode may be found and locked via some other list; if so we 1915 * can't recycle it yet.
1916 */ 1917 if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) { 1918 CTR2(KTR_VFS, 1919 "%s: impossible to recycle, vp %p lock is already held", 1920 __func__, vp); 1921 vdrop_recycle(vp); 1922 return (EWOULDBLOCK); 1923 } 1924 /* 1925 * Don't recycle if its filesystem is being suspended. 1926 */ 1927 if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) { 1928 VOP_UNLOCK(vp); 1929 CTR2(KTR_VFS, 1930 "%s: impossible to recycle, cannot start the write for %p", 1931 __func__, vp); 1932 vdrop_recycle(vp); 1933 return (EBUSY); 1934 } 1935 /* 1936 * If we got this far, we need to acquire the interlock and see if 1937 * anyone picked up this vnode from another list. If not, we will 1938 * mark it with DOOMED via vgonel() so that anyone who does find it 1939 * will skip over it. 1940 */ 1941 VI_LOCK(vp); 1942 if (vp->v_usecount) { 1943 VOP_UNLOCK(vp); 1944 vdropl_recycle(vp); 1945 vn_finished_write(vnmp); 1946 CTR2(KTR_VFS, 1947 "%s: impossible to recycle, %p is already referenced", 1948 __func__, vp); 1949 return (EBUSY); 1950 } 1951 if (!VN_IS_DOOMED(vp)) { 1952 if (isvnlru) 1953 recycles_free_count++; 1954 else 1955 counter_u64_add(direct_recycles_free_count, 1); 1956 vgonel(vp); 1957 } 1958 VOP_UNLOCK(vp); 1959 vdropl_recycle(vp); 1960 vn_finished_write(vnmp); 1961 return (0); 1962 } 1963 1964 /* 1965 * Allocate a new vnode. 1966 * 1967 * The operation never returns an error. Returning an error was disabled 1968 * in r145385 (dated 2005) with the following comment: 1969 * 1970 * XXX Not all VFS_VGET/ffs_vget callers check returns. 1971 * 1972 * Given the age of this commit (almost 15 years at the time of writing this 1973 * comment) restoring the ability to fail requires a significant audit of 1974 * all codepaths. 1975 * 1976 * The routine can try to free a vnode or stall for up to 1 second waiting for 1977 * vnlru to clear things up, but ultimately always performs a M_WAITOK allocation. 1978 */ 1979 static u_long vn_alloc_cyclecount; 1980 static u_long vn_alloc_sleeps; 1981 1982 SYSCTL_ULONG(_vfs_vnode_stats, OID_AUTO, alloc_sleeps, CTLFLAG_RD, &vn_alloc_sleeps, 0, 1983 "Number of times vnode allocation blocked waiting on vnlru"); 1984 1985 static struct vnode * __noinline 1986 vn_alloc_hard(struct mount *mp, u_long rnumvnodes, bool bumped) 1987 { 1988 u_long rfreevnodes; 1989 1990 if (bumped) { 1991 if (rnumvnodes > desiredvnodes + VNLRU_COUNT_SLOP) { 1992 atomic_subtract_long(&numvnodes, 1); 1993 bumped = false; 1994 } 1995 } 1996 1997 mtx_lock(&vnode_list_mtx); 1998 1999 rfreevnodes = vnlru_read_freevnodes(); 2000 if (vn_alloc_cyclecount++ >= rfreevnodes) { 2001 vn_alloc_cyclecount = 0; 2002 vstir = true; 2003 } 2004 /* 2005 * Grow the vnode cache if it will not be above its target max after 2006 * growing. Otherwise, if there is at least one free vnode, try to 2007 * reclaim 1 item from it before growing the cache (possibly above its 2008 * target max if the reclamation failed or is delayed). 2009 */ 2010 if (vnlru_free_locked_direct(1) > 0) 2011 goto alloc; 2012 mtx_assert(&vnode_list_mtx, MA_NOTOWNED); 2013 if (mp == NULL || (mp->mnt_kern_flag & MNTK_SUSPEND) == 0) { 2014 /* 2015 * Wait for space for a new vnode. 
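 * The wait below is bounded: msleep() is given a timeout of hz ticks
 * (about one second), matching the "stall for up to 1 second" note in the
 * comment above vn_alloc_hard().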
2016 */ 2017 if (bumped) { 2018 atomic_subtract_long(&numvnodes, 1); 2019 bumped = false; 2020 } 2021 mtx_lock(&vnode_list_mtx); 2022 vnlru_kick_locked(); 2023 vn_alloc_sleeps++; 2024 msleep(&vnlruproc_sig, &vnode_list_mtx, PVFS, "vlruwk", hz); 2025 if (atomic_load_long(&numvnodes) + 1 > desiredvnodes && 2026 vnlru_read_freevnodes() > 1) 2027 vnlru_free_locked_direct(1); 2028 else 2029 mtx_unlock(&vnode_list_mtx); 2030 } 2031 alloc: 2032 mtx_assert(&vnode_list_mtx, MA_NOTOWNED); 2033 if (!bumped) 2034 atomic_add_long(&numvnodes, 1); 2035 vnlru_kick_cond(); 2036 return (uma_zalloc_smr(vnode_zone, M_WAITOK)); 2037 } 2038 2039 static struct vnode * 2040 vn_alloc(struct mount *mp) 2041 { 2042 u_long rnumvnodes; 2043 2044 if (__predict_false(vn_alloc_cyclecount != 0)) 2045 return (vn_alloc_hard(mp, 0, false)); 2046 rnumvnodes = atomic_fetchadd_long(&numvnodes, 1) + 1; 2047 if (__predict_false(vnlru_under(rnumvnodes, vlowat))) { 2048 return (vn_alloc_hard(mp, rnumvnodes, true)); 2049 } 2050 2051 return (uma_zalloc_smr(vnode_zone, M_WAITOK)); 2052 } 2053 2054 static void 2055 vn_free(struct vnode *vp) 2056 { 2057 2058 atomic_subtract_long(&numvnodes, 1); 2059 uma_zfree_smr(vnode_zone, vp); 2060 } 2061 2062 /* 2063 * Allocate a new vnode. 2064 */ 2065 int 2066 getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops, 2067 struct vnode **vpp) 2068 { 2069 struct vnode *vp; 2070 struct thread *td; 2071 struct lock_object *lo; 2072 2073 CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag); 2074 2075 KASSERT(vops->registered, 2076 ("%s: not registered vector op %p\n", __func__, vops)); 2077 cache_validate_vop_vector(mp, vops); 2078 2079 td = curthread; 2080 if (td->td_vp_reserved != NULL) { 2081 vp = td->td_vp_reserved; 2082 td->td_vp_reserved = NULL; 2083 } else { 2084 vp = vn_alloc(mp); 2085 } 2086 counter_u64_add(vnodes_created, 1); 2087 2088 vn_set_state(vp, VSTATE_UNINITIALIZED); 2089 2090 /* 2091 * Locks are given the generic name "vnode" when created. 2092 * Follow the historic practice of using the filesystem 2093 * name when they are allocated, e.g., "zfs", "ufs", "nfs", etc. 2094 * 2095 * Locks live in a witness group keyed on their name. Thus, 2096 * when a lock is renamed, it must also move from the witness 2097 * group of its old name to the witness group of its new name. 2098 * 2099 * The change only needs to be made when the vnode moves 2100 * from one filesystem type to another. We ensure that each 2101 * filesystem uses a single static name pointer for its tag so 2102 * that we can compare pointers rather than doing a strcmp(). 2103 */ 2104 lo = &vp->v_vnlock->lock_object; 2105 #ifdef WITNESS 2106 if (lo->lo_name != tag) { 2107 #endif 2108 lo->lo_name = tag; 2109 #ifdef WITNESS 2110 WITNESS_DESTROY(lo); 2111 WITNESS_INIT(lo, tag); 2112 } 2113 #endif 2114 /* 2115 * By default, don't allow shared locks unless filesystems opt-in. 2116 */ 2117 vp->v_vnlock->lock_object.lo_flags |= LK_NOSHARE; 2118 /* 2119 * Finalize various vnode identity bits.
2120 */ 2121 KASSERT(vp->v_object == NULL, ("stale v_object %p", vp)); 2122 KASSERT(vp->v_lockf == NULL, ("stale v_lockf %p", vp)); 2123 KASSERT(vp->v_pollinfo == NULL, ("stale v_pollinfo %p", vp)); 2124 vp->v_type = VNON; 2125 vp->v_op = vops; 2126 vp->v_irflag = 0; 2127 v_init_counters(vp); 2128 vn_seqc_init(vp); 2129 vp->v_bufobj.bo_ops = &buf_ops_bio; 2130 #ifdef DIAGNOSTIC 2131 if (mp == NULL && vops != &dead_vnodeops) 2132 printf("NULL mp in getnewvnode(9), tag %s\n", tag); 2133 #endif 2134 #ifdef MAC 2135 mac_vnode_init(vp); 2136 if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0) 2137 mac_vnode_associate_singlelabel(mp, vp); 2138 #endif 2139 if (mp != NULL) { 2140 vp->v_bufobj.bo_bsize = mp->mnt_stat.f_iosize; 2141 } 2142 2143 /* 2144 * For the filesystems which do not use vfs_hash_insert(), 2145 * still initialize v_hash to have vfs_hash_index() useful. 2146 * E.g., nullfs uses vfs_hash_index() on the lower vnode for 2147 * its own hashing. 2148 */ 2149 vp->v_hash = (uintptr_t)vp >> vnsz2log; 2150 2151 *vpp = vp; 2152 return (0); 2153 } 2154 2155 void 2156 getnewvnode_reserve(void) 2157 { 2158 struct thread *td; 2159 2160 td = curthread; 2161 MPASS(td->td_vp_reserved == NULL); 2162 td->td_vp_reserved = vn_alloc(NULL); 2163 } 2164 2165 void 2166 getnewvnode_drop_reserve(void) 2167 { 2168 struct thread *td; 2169 2170 td = curthread; 2171 if (td->td_vp_reserved != NULL) { 2172 vn_free(td->td_vp_reserved); 2173 td->td_vp_reserved = NULL; 2174 } 2175 } 2176 2177 static void __noinline 2178 freevnode(struct vnode *vp) 2179 { 2180 struct bufobj *bo; 2181 2182 /* 2183 * The vnode has been marked for destruction, so free it. 2184 * 2185 * The vnode will be returned to the zone where it will 2186 * normally remain until it is needed for another vnode. We 2187 * need to cleanup (or verify that the cleanup has already 2188 * been done) any residual data left from its current use 2189 * so as not to contaminate the freshly allocated vnode. 2190 */ 2191 CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp); 2192 /* 2193 * Paired with vgone. 2194 */ 2195 vn_seqc_write_end_free(vp); 2196 2197 bo = &vp->v_bufobj; 2198 VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't")); 2199 VNPASS(vp->v_holdcnt == VHOLD_NO_SMR, vp); 2200 VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count")); 2201 VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count")); 2202 VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's")); 2203 VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0")); 2204 VNASSERT(pctrie_is_empty(&bo->bo_clean.bv_root), vp, 2205 ("clean blk trie not empty")); 2206 VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0")); 2207 VNASSERT(pctrie_is_empty(&bo->bo_dirty.bv_root), vp, 2208 ("dirty blk trie not empty")); 2209 VNASSERT((vp->v_iflag & (VI_DOINGINACT | VI_OWEINACT)) == 0, vp, 2210 ("Leaked inactivation")); 2211 VI_UNLOCK(vp); 2212 cache_assert_no_entries(vp); 2213 2214 #ifdef MAC 2215 mac_vnode_destroy(vp); 2216 #endif 2217 if (vp->v_pollinfo != NULL) { 2218 /* 2219 * Use LK_NOWAIT to shut up witness about the lock. We may get 2220 * here while having another vnode locked when trying to 2221 * satisfy a lookup and needing to recycle. 
2222 */ 2223 VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT); 2224 destroy_vpollinfo(vp->v_pollinfo); 2225 VOP_UNLOCK(vp); 2226 vp->v_pollinfo = NULL; 2227 } 2228 vp->v_mountedhere = NULL; 2229 vp->v_unpcb = NULL; 2230 vp->v_rdev = NULL; 2231 vp->v_fifoinfo = NULL; 2232 vp->v_iflag = 0; 2233 vp->v_vflag = 0; 2234 bo->bo_flag = 0; 2235 vn_free(vp); 2236 } 2237 2238 /* 2239 * Delete from old mount point vnode list, if on one. 2240 */ 2241 static void 2242 delmntque(struct vnode *vp) 2243 { 2244 struct mount *mp; 2245 2246 VNPASS((vp->v_mflag & VMP_LAZYLIST) == 0, vp); 2247 2248 mp = vp->v_mount; 2249 MNT_ILOCK(mp); 2250 VI_LOCK(vp); 2251 vp->v_mount = NULL; 2252 VNASSERT(mp->mnt_nvnodelistsize > 0, vp, 2253 ("bad mount point vnode list size")); 2254 TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); 2255 mp->mnt_nvnodelistsize--; 2256 MNT_REL(mp); 2257 MNT_IUNLOCK(mp); 2258 /* 2259 * The caller expects the interlock to be still held. 2260 */ 2261 ASSERT_VI_LOCKED(vp, __func__); 2262 } 2263 2264 static int 2265 insmntque1_int(struct vnode *vp, struct mount *mp, bool dtr) 2266 { 2267 2268 KASSERT(vp->v_mount == NULL, 2269 ("insmntque: vnode already on per mount vnode list")); 2270 VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)")); 2271 if ((mp->mnt_kern_flag & MNTK_UNLOCKED_INSMNTQUE) == 0) { 2272 ASSERT_VOP_ELOCKED(vp, "insmntque: non-locked vp"); 2273 } else { 2274 KASSERT(!dtr, 2275 ("%s: can't have MNTK_UNLOCKED_INSMNTQUE and cleanup", 2276 __func__)); 2277 } 2278 2279 /* 2280 * We acquire the vnode interlock early to ensure that the 2281 * vnode cannot be recycled by another process releasing a 2282 * holdcnt on it before we get it on both the vnode list 2283 * and the active vnode list. The mount mutex protects only 2284 * manipulation of the vnode list and the vnode freelist 2285 * mutex protects only manipulation of the active vnode list. 2286 * Hence the need to hold the vnode interlock throughout. 2287 */ 2288 MNT_ILOCK(mp); 2289 VI_LOCK(vp); 2290 if (((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0 && 2291 ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 || 2292 mp->mnt_nvnodelistsize == 0)) && 2293 (vp->v_vflag & VV_FORCEINSMQ) == 0) { 2294 VI_UNLOCK(vp); 2295 MNT_IUNLOCK(mp); 2296 if (dtr) { 2297 vp->v_data = NULL; 2298 vp->v_op = &dead_vnodeops; 2299 vgone(vp); 2300 vput(vp); 2301 } 2302 return (EBUSY); 2303 } 2304 vp->v_mount = mp; 2305 MNT_REF(mp); 2306 TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); 2307 VNASSERT(mp->mnt_nvnodelistsize >= 0, vp, 2308 ("neg mount point vnode list size")); 2309 mp->mnt_nvnodelistsize++; 2310 VI_UNLOCK(vp); 2311 MNT_IUNLOCK(mp); 2312 return (0); 2313 } 2314 2315 /* 2316 * Insert into list of vnodes for the new mount point, if available. 2317 * insmntque() reclaims the vnode on insertion failure, insmntque1() 2318 * leaves handling of the vnode to the caller. 2319 */ 2320 int 2321 insmntque(struct vnode *vp, struct mount *mp) 2322 { 2323 return (insmntque1_int(vp, mp, true)); 2324 } 2325 2326 int 2327 insmntque1(struct vnode *vp, struct mount *mp) 2328 { 2329 return (insmntque1_int(vp, mp, false)); 2330 } 2331 2332 /* 2333 * Flush out and invalidate all buffers associated with a bufobj 2334 * Called with the underlying object locked. 
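 * With V_SAVE the dirty buffers are first synced out via BO_SYNC(); the
 * call fails with EBUSY if dirty buffers or pending writes remain after
 * that.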
2335 */ 2336 int 2337 bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo) 2338 { 2339 int error; 2340 2341 BO_LOCK(bo); 2342 if (flags & V_SAVE) { 2343 error = bufobj_wwait(bo, slpflag, slptimeo); 2344 if (error) { 2345 BO_UNLOCK(bo); 2346 return (error); 2347 } 2348 if (bo->bo_dirty.bv_cnt > 0) { 2349 BO_UNLOCK(bo); 2350 do { 2351 error = BO_SYNC(bo, MNT_WAIT); 2352 } while (error == ERELOOKUP); 2353 if (error != 0) 2354 return (error); 2355 BO_LOCK(bo); 2356 if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) { 2357 BO_UNLOCK(bo); 2358 return (EBUSY); 2359 } 2360 } 2361 } 2362 /* 2363 * If you alter this loop please notice that interlock is dropped and 2364 * reacquired in flushbuflist. Special care is needed to ensure that 2365 * no race conditions occur from this. 2366 */ 2367 do { 2368 error = flushbuflist(&bo->bo_clean, 2369 flags, bo, slpflag, slptimeo); 2370 if (error == 0 && !(flags & V_CLEANONLY)) 2371 error = flushbuflist(&bo->bo_dirty, 2372 flags, bo, slpflag, slptimeo); 2373 if (error != 0 && error != EAGAIN) { 2374 BO_UNLOCK(bo); 2375 return (error); 2376 } 2377 } while (error != 0); 2378 2379 /* 2380 * Wait for I/O to complete. XXX needs cleaning up. The vnode can 2381 * have write I/O in-progress but if there is a VM object then the 2382 * VM object can also have read-I/O in-progress. 2383 */ 2384 do { 2385 bufobj_wwait(bo, 0, 0); 2386 if ((flags & V_VMIO) == 0 && bo->bo_object != NULL) { 2387 BO_UNLOCK(bo); 2388 vm_object_pip_wait_unlocked(bo->bo_object, "bovlbx"); 2389 BO_LOCK(bo); 2390 } 2391 } while (bo->bo_numoutput > 0); 2392 BO_UNLOCK(bo); 2393 2394 /* 2395 * Destroy the copy in the VM cache, too. 2396 */ 2397 if (bo->bo_object != NULL && 2398 (flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0) { 2399 VM_OBJECT_WLOCK(bo->bo_object); 2400 vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ? 2401 OBJPR_CLEANONLY : 0); 2402 VM_OBJECT_WUNLOCK(bo->bo_object); 2403 } 2404 2405 #ifdef INVARIANTS 2406 BO_LOCK(bo); 2407 if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO | 2408 V_ALLOWCLEAN)) == 0 && (bo->bo_dirty.bv_cnt > 0 || 2409 bo->bo_clean.bv_cnt > 0)) 2410 panic("vinvalbuf: flush failed"); 2411 if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0 && 2412 bo->bo_dirty.bv_cnt > 0) 2413 panic("vinvalbuf: flush dirty failed"); 2414 BO_UNLOCK(bo); 2415 #endif 2416 return (0); 2417 } 2418 2419 /* 2420 * Flush out and invalidate all buffers associated with a vnode. 2421 * Called with the underlying object locked. 2422 */ 2423 int 2424 vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo) 2425 { 2426 2427 CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags); 2428 ASSERT_VOP_LOCKED(vp, "vinvalbuf"); 2429 if (vp->v_object != NULL && vp->v_object->handle != vp) 2430 return (0); 2431 return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo)); 2432 } 2433 2434 /* 2435 * Flush out buffers on the specified list. 2436 * 2437 */ 2438 static int 2439 flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag, 2440 int slptimeo) 2441 { 2442 struct buf *bp, *nbp; 2443 int retval, error; 2444 daddr_t lblkno; 2445 b_xflags_t xflags; 2446 2447 ASSERT_BO_WLOCKED(bo); 2448 2449 retval = 0; 2450 TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) { 2451 /* 2452 * If we are flushing both V_NORMAL and V_ALT buffers then 2453 * do not skip any buffers. If we are flushing only V_NORMAL 2454 * buffers then skip buffers marked as BX_ALTDATA. 
If we are 2455 * flushing only V_ALT buffers then skip buffers not marked 2456 * as BX_ALTDATA. 2457 */ 2458 if (((flags & (V_NORMAL | V_ALT)) != (V_NORMAL | V_ALT)) && 2459 (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA) != 0) || 2460 ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0))) { 2461 continue; 2462 } 2463 if (nbp != NULL) { 2464 lblkno = nbp->b_lblkno; 2465 xflags = nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN); 2466 } 2467 retval = EAGAIN; 2468 error = BUF_TIMELOCK(bp, 2469 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo), 2470 "flushbuf", slpflag, slptimeo); 2471 if (error) { 2472 BO_LOCK(bo); 2473 return (error != ENOLCK ? error : EAGAIN); 2474 } 2475 KASSERT(bp->b_bufobj == bo, 2476 ("bp %p wrong b_bufobj %p should be %p", 2477 bp, bp->b_bufobj, bo)); 2478 /* 2479 * XXX Since there are no node locks for NFS, I 2480 * believe there is a slight chance that a delayed 2481 * write will occur while sleeping just above, so 2482 * check for it. 2483 */ 2484 if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) && 2485 (flags & V_SAVE)) { 2486 bremfree(bp); 2487 bp->b_flags |= B_ASYNC; 2488 bwrite(bp); 2489 BO_LOCK(bo); 2490 return (EAGAIN); /* XXX: why not loop ? */ 2491 } 2492 bremfree(bp); 2493 bp->b_flags |= (B_INVAL | B_RELBUF); 2494 bp->b_flags &= ~B_ASYNC; 2495 brelse(bp); 2496 BO_LOCK(bo); 2497 if (nbp == NULL) 2498 break; 2499 nbp = gbincore(bo, lblkno); 2500 if (nbp == NULL || (nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) 2501 != xflags) 2502 break; /* nbp invalid */ 2503 } 2504 return (retval); 2505 } 2506 2507 int 2508 bnoreuselist(struct bufv *bufv, struct bufobj *bo, daddr_t startn, daddr_t endn) 2509 { 2510 struct buf *bp; 2511 int error; 2512 daddr_t lblkno; 2513 2514 ASSERT_BO_LOCKED(bo); 2515 2516 for (lblkno = startn;;) { 2517 again: 2518 bp = buf_lookup_ge(bufv, lblkno); 2519 if (bp == NULL || bp->b_lblkno >= endn) 2520 break; 2521 error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | 2522 LK_INTERLOCK, BO_LOCKPTR(bo), "brlsfl", 0, 0); 2523 if (error != 0) { 2524 BO_RLOCK(bo); 2525 if (error == ENOLCK) 2526 goto again; 2527 return (error); 2528 } 2529 KASSERT(bp->b_bufobj == bo, 2530 ("bp %p wrong b_bufobj %p should be %p", 2531 bp, bp->b_bufobj, bo)); 2532 lblkno = bp->b_lblkno + 1; 2533 if ((bp->b_flags & B_MANAGED) == 0) 2534 bremfree(bp); 2535 bp->b_flags |= B_RELBUF; 2536 /* 2537 * In the VMIO case, use the B_NOREUSE flag to hint that the 2538 * pages backing each buffer in the range are unlikely to be 2539 * reused. Dirty buffers will have the hint applied once 2540 * they've been written. 2541 */ 2542 if ((bp->b_flags & B_VMIO) != 0) 2543 bp->b_flags |= B_NOREUSE; 2544 brelse(bp); 2545 BO_RLOCK(bo); 2546 } 2547 return (0); 2548 } 2549 2550 /* 2551 * Truncate a file's buffer and pages to a specified length. This 2552 * is in lieu of the old vinvalbuf mechanism, which performed unneeded 2553 * sync activity. 2554 */ 2555 int 2556 vtruncbuf(struct vnode *vp, off_t length, int blksize) 2557 { 2558 struct buf *bp, *nbp; 2559 struct bufobj *bo; 2560 daddr_t startlbn; 2561 2562 CTR4(KTR_VFS, "%s: vp %p with block %d:%ju", __func__, 2563 vp, blksize, (uintmax_t)length); 2564 2565 /* 2566 * Round up to the *next* lbn. 2567 */ 2568 startlbn = howmany(length, blksize); 2569 2570 ASSERT_VOP_LOCKED(vp, "vtruncbuf"); 2571 2572 bo = &vp->v_bufobj; 2573 restart_unlocked: 2574 BO_LOCK(bo); 2575 2576 while (v_inval_buf_range_locked(vp, bo, startlbn, INT64_MAX) == EAGAIN) 2577 ; 2578 2579 if (length > 0) { 2580 /* 2581 * Write out vnode metadata, e.g. 
indirect blocks. 2582 */ 2583 restartsync: 2584 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { 2585 if (bp->b_lblkno >= 0) 2586 continue; 2587 /* 2588 * Since we hold the vnode lock this should only 2589 * fail if we're racing with the buf daemon. 2590 */ 2591 if (BUF_LOCK(bp, 2592 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, 2593 BO_LOCKPTR(bo)) == ENOLCK) 2594 goto restart_unlocked; 2595 2596 VNASSERT((bp->b_flags & B_DELWRI), vp, 2597 ("buf(%p) on dirty queue without DELWRI", bp)); 2598 2599 bremfree(bp); 2600 bawrite(bp); 2601 BO_LOCK(bo); 2602 goto restartsync; 2603 } 2604 } 2605 2606 bufobj_wwait(bo, 0, 0); 2607 BO_UNLOCK(bo); 2608 vnode_pager_setsize(vp, length); 2609 2610 return (0); 2611 } 2612 2613 /* 2614 * Invalidate the cached pages of a file's buffer within the range of block 2615 * numbers [startlbn, endlbn). 2616 */ 2617 void 2618 v_inval_buf_range(struct vnode *vp, daddr_t startlbn, daddr_t endlbn, 2619 int blksize) 2620 { 2621 struct bufobj *bo; 2622 off_t start, end; 2623 2624 ASSERT_VOP_LOCKED(vp, "v_inval_buf_range"); 2625 2626 start = blksize * startlbn; 2627 end = blksize * endlbn; 2628 2629 bo = &vp->v_bufobj; 2630 BO_LOCK(bo); 2631 MPASS(blksize == bo->bo_bsize); 2632 2633 while (v_inval_buf_range_locked(vp, bo, startlbn, endlbn) == EAGAIN) 2634 ; 2635 2636 BO_UNLOCK(bo); 2637 vn_pages_remove(vp, OFF_TO_IDX(start), OFF_TO_IDX(end + PAGE_SIZE - 1)); 2638 } 2639 2640 static int 2641 v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo, 2642 daddr_t startlbn, daddr_t endlbn) 2643 { 2644 struct bufv *bv; 2645 struct buf *bp, *nbp; 2646 uint8_t anyfreed; 2647 bool clean; 2648 2649 ASSERT_VOP_LOCKED(vp, "v_inval_buf_range_locked"); 2650 ASSERT_BO_LOCKED(bo); 2651 2652 anyfreed = 1; 2653 clean = true; 2654 do { 2655 bv = clean ? &bo->bo_clean : &bo->bo_dirty; 2656 bp = buf_lookup_ge(bv, startlbn); 2657 if (bp == NULL) 2658 continue; 2659 TAILQ_FOREACH_FROM_SAFE(bp, &bv->bv_hd, b_bobufs, nbp) { 2660 if (bp->b_lblkno >= endlbn) 2661 break; 2662 if (BUF_LOCK(bp, 2663 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, 2664 BO_LOCKPTR(bo)) == ENOLCK) { 2665 BO_LOCK(bo); 2666 return (EAGAIN); 2667 } 2668 2669 bremfree(bp); 2670 bp->b_flags |= B_INVAL | B_RELBUF; 2671 bp->b_flags &= ~B_ASYNC; 2672 brelse(bp); 2673 anyfreed = 2; 2674 2675 BO_LOCK(bo); 2676 if (nbp != NULL && 2677 (((nbp->b_xflags & 2678 (clean ? BX_VNCLEAN : BX_VNDIRTY)) == 0) || 2679 nbp->b_vp != vp || 2680 (nbp->b_flags & B_DELWRI) == (clean? B_DELWRI: 0))) 2681 return (EAGAIN); 2682 } 2683 } while (clean = !clean, anyfreed-- > 0); 2684 return (0); 2685 } 2686 2687 static void 2688 buf_vlist_remove(struct buf *bp) 2689 { 2690 struct bufv *bv; 2691 b_xflags_t flags; 2692 2693 flags = bp->b_xflags; 2694 2695 KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); 2696 ASSERT_BO_WLOCKED(bp->b_bufobj); 2697 KASSERT((flags & (BX_VNDIRTY | BX_VNCLEAN)) != 0 && 2698 (flags & (BX_VNDIRTY | BX_VNCLEAN)) != (BX_VNDIRTY | BX_VNCLEAN), 2699 ("%s: buffer %p has invalid queue state", __func__, bp)); 2700 2701 if ((flags & BX_VNDIRTY) != 0) 2702 bv = &bp->b_bufobj->bo_dirty; 2703 else 2704 bv = &bp->b_bufobj->bo_clean; 2705 BUF_PCTRIE_REMOVE(&bv->bv_root, bp->b_lblkno); 2706 TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs); 2707 bv->bv_cnt--; 2708 bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN); 2709 } 2710 2711 /* 2712 * Add the buffer to the sorted clean or dirty block list. Return zero on 2713 * success, EEXIST if a buffer with this identity already exists, or another 2714 * error on allocation failure. 
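 * Callers handle failure differently: buf_vlist_add() treats any error as
 * fatal and panics, while bgetvp() tolerates EEXIST from a lost insertion
 * race.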
2715 */ 2716 static inline int 2717 buf_vlist_find_or_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags) 2718 { 2719 struct bufv *bv; 2720 struct buf *n; 2721 int error; 2722 2723 ASSERT_BO_WLOCKED(bo); 2724 KASSERT((bo->bo_flag & BO_NOBUFS) == 0, 2725 ("buf_vlist_add: bo %p does not allow bufs", bo)); 2726 KASSERT((xflags & BX_VNDIRTY) == 0 || (bo->bo_flag & BO_DEAD) == 0, 2727 ("dead bo %p", bo)); 2728 KASSERT((bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) == xflags, 2729 ("buf_vlist_add: b_xflags %#x not set on bp %p", xflags, bp)); 2730 2731 if (xflags & BX_VNDIRTY) 2732 bv = &bo->bo_dirty; 2733 else 2734 bv = &bo->bo_clean; 2735 2736 error = buf_insert_lookup_le(bv, bp, &n); 2737 if (n == NULL) { 2738 KASSERT(error != EEXIST, 2739 ("buf_vlist_add: EEXIST but no existing buf found: bp %p", 2740 bp)); 2741 } else { 2742 KASSERT(n->b_lblkno <= bp->b_lblkno, 2743 ("buf_vlist_add: out of order insert/lookup: bp %p n %p", 2744 bp, n)); 2745 KASSERT((n->b_lblkno == bp->b_lblkno) == (error == EEXIST), 2746 ("buf_vlist_add: inconsistent result for existing buf: " 2747 "error %d bp %p n %p", error, bp, n)); 2748 } 2749 if (error != 0) 2750 return (error); 2751 2752 /* Keep the list ordered. */ 2753 if (n == NULL) { 2754 KASSERT(TAILQ_EMPTY(&bv->bv_hd) || 2755 bp->b_lblkno < TAILQ_FIRST(&bv->bv_hd)->b_lblkno, 2756 ("buf_vlist_add: queue order: " 2757 "%p should be before first %p", 2758 bp, TAILQ_FIRST(&bv->bv_hd))); 2759 TAILQ_INSERT_HEAD(&bv->bv_hd, bp, b_bobufs); 2760 } else { 2761 KASSERT(TAILQ_NEXT(n, b_bobufs) == NULL || 2762 bp->b_lblkno < TAILQ_NEXT(n, b_bobufs)->b_lblkno, 2763 ("buf_vlist_add: queue order: " 2764 "%p should be before next %p", 2765 bp, TAILQ_NEXT(n, b_bobufs))); 2766 TAILQ_INSERT_AFTER(&bv->bv_hd, n, bp, b_bobufs); 2767 } 2768 2769 bv->bv_cnt++; 2770 return (0); 2771 } 2772 2773 /* 2774 * Add the buffer to the sorted clean or dirty block list. 2775 * 2776 * NOTE: xflags is passed as a constant, optimizing this inline function! 2777 */ 2778 static void 2779 buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags) 2780 { 2781 int error; 2782 2783 KASSERT((bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) == 0, 2784 ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags)); 2785 bp->b_xflags |= xflags; 2786 error = buf_vlist_find_or_add(bp, bo, xflags); 2787 if (error) 2788 panic("buf_vlist_add: error=%d", error); 2789 } 2790 2791 /* 2792 * Look up a buffer using the buffer tries. 2793 */ 2794 struct buf * 2795 gbincore(struct bufobj *bo, daddr_t lblkno) 2796 { 2797 struct buf *bp; 2798 2799 ASSERT_BO_LOCKED(bo); 2800 bp = BUF_PCTRIE_LOOKUP(&bo->bo_clean.bv_root, lblkno); 2801 if (bp != NULL) 2802 return (bp); 2803 return (BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, lblkno)); 2804 } 2805 2806 /* 2807 * Look up a buf using the buffer tries, without the bufobj lock. This relies 2808 * on SMR for safe lookup, and bufs being in a no-free zone to provide type 2809 * stability of the result. Like other lockless lookups, the found buf may 2810 * already be invalid by the time this function returns. 2811 */ 2812 struct buf * 2813 gbincore_unlocked(struct bufobj *bo, daddr_t lblkno) 2814 { 2815 struct buf *bp; 2816 2817 ASSERT_BO_UNLOCKED(bo); 2818 bp = BUF_PCTRIE_LOOKUP_UNLOCKED(&bo->bo_clean.bv_root, lblkno); 2819 if (bp != NULL) 2820 return (bp); 2821 return (BUF_PCTRIE_LOOKUP_UNLOCKED(&bo->bo_dirty.bv_root, lblkno)); 2822 } 2823 2824 /* 2825 * Associate a buffer with a vnode. 
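 * On success the vnode gains a hold reference (vhold()); brelvp() drops it
 * again when the buffer is disassociated.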
2826 */ 2827 int 2828 bgetvp(struct vnode *vp, struct buf *bp) 2829 { 2830 struct bufobj *bo; 2831 int error; 2832 2833 bo = &vp->v_bufobj; 2834 ASSERT_BO_UNLOCKED(bo); 2835 VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free")); 2836 2837 CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags); 2838 VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp, 2839 ("bgetvp: bp already attached! %p", bp)); 2840 2841 /* 2842 * Add the buf to the vnode's clean list unless we lost a race and find 2843 * an existing buf in either dirty or clean. 2844 */ 2845 bp->b_vp = vp; 2846 bp->b_bufobj = bo; 2847 bp->b_xflags |= BX_VNCLEAN; 2848 error = EEXIST; 2849 BO_LOCK(bo); 2850 if (BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, bp->b_lblkno) == NULL) 2851 error = buf_vlist_find_or_add(bp, bo, BX_VNCLEAN); 2852 BO_UNLOCK(bo); 2853 if (__predict_true(error == 0)) { 2854 vhold(vp); 2855 return (0); 2856 } 2857 if (error != EEXIST) 2858 panic("bgetvp: buf_vlist_add error: %d", error); 2859 bp->b_vp = NULL; 2860 bp->b_bufobj = NULL; 2861 bp->b_xflags &= ~BX_VNCLEAN; 2862 return (error); 2863 } 2864 2865 /* 2866 * Disassociate a buffer from a vnode. 2867 */ 2868 void 2869 brelvp(struct buf *bp) 2870 { 2871 struct bufobj *bo; 2872 struct vnode *vp; 2873 2874 CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); 2875 KASSERT(bp->b_vp != NULL, ("brelvp: NULL")); 2876 2877 /* 2878 * Delete from old vnode list, if on one. 2879 */ 2880 vp = bp->b_vp; /* XXX */ 2881 bo = bp->b_bufobj; 2882 BO_LOCK(bo); 2883 buf_vlist_remove(bp); 2884 if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { 2885 bo->bo_flag &= ~BO_ONWORKLST; 2886 mtx_lock(&sync_mtx); 2887 LIST_REMOVE(bo, bo_synclist); 2888 syncer_worklist_len--; 2889 mtx_unlock(&sync_mtx); 2890 } 2891 bp->b_vp = NULL; 2892 bp->b_bufobj = NULL; 2893 BO_UNLOCK(bo); 2894 vdrop(vp); 2895 } 2896 2897 /* 2898 * Add an item to the syncer work queue. 
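 * The delay (in seconds) is clamped to syncer_maxdelay - 2 and, together
 * with the current syncer_delayno, selects a slot in the
 * syncer_workitem_pending wheel.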
2899 */ 2900 static void 2901 vn_syncer_add_to_worklist(struct bufobj *bo, int delay) 2902 { 2903 int slot; 2904 2905 ASSERT_BO_WLOCKED(bo); 2906 2907 mtx_lock(&sync_mtx); 2908 if (bo->bo_flag & BO_ONWORKLST) 2909 LIST_REMOVE(bo, bo_synclist); 2910 else { 2911 bo->bo_flag |= BO_ONWORKLST; 2912 syncer_worklist_len++; 2913 } 2914 2915 if (delay > syncer_maxdelay - 2) 2916 delay = syncer_maxdelay - 2; 2917 slot = (syncer_delayno + delay) & syncer_mask; 2918 2919 LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist); 2920 mtx_unlock(&sync_mtx); 2921 } 2922 2923 static int 2924 sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS) 2925 { 2926 int error, len; 2927 2928 mtx_lock(&sync_mtx); 2929 len = syncer_worklist_len - sync_vnode_count; 2930 mtx_unlock(&sync_mtx); 2931 error = SYSCTL_OUT(req, &len, sizeof(len)); 2932 return (error); 2933 } 2934 2935 SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, 2936 CTLTYPE_INT | CTLFLAG_MPSAFE| CTLFLAG_RD, NULL, 0, 2937 sysctl_vfs_worklist_len, "I", "Syncer thread worklist length"); 2938 2939 static struct proc *updateproc; 2940 static void sched_sync(void); 2941 static struct kproc_desc up_kp = { 2942 "syncer", 2943 sched_sync, 2944 &updateproc 2945 }; 2946 SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp); 2947 2948 static int 2949 sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td) 2950 { 2951 struct vnode *vp; 2952 struct mount *mp; 2953 2954 *bo = LIST_FIRST(slp); 2955 if (*bo == NULL) 2956 return (0); 2957 vp = bo2vnode(*bo); 2958 if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0) 2959 return (1); 2960 /* 2961 * We use vhold in case the vnode does not 2962 * successfully sync. vhold prevents the vnode from 2963 * going away when we unlock the sync_mtx so that 2964 * we can acquire the vnode interlock. 2965 */ 2966 vholdl(vp); 2967 mtx_unlock(&sync_mtx); 2968 VI_UNLOCK(vp); 2969 if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { 2970 vdrop(vp); 2971 mtx_lock(&sync_mtx); 2972 return (*bo == LIST_FIRST(slp)); 2973 } 2974 MPASSERT(mp == NULL || (curthread->td_pflags & TDP_IGNSUSP) != 0 || 2975 (mp->mnt_kern_flag & MNTK_SUSPENDED) == 0, mp, 2976 ("suspended mp syncing vp %p", vp)); 2977 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 2978 (void) VOP_FSYNC(vp, MNT_LAZY, td); 2979 VOP_UNLOCK(vp); 2980 vn_finished_write(mp); 2981 BO_LOCK(*bo); 2982 if (((*bo)->bo_flag & BO_ONWORKLST) != 0) { 2983 /* 2984 * Put us back on the worklist. The worklist 2985 * routine will remove us from our current 2986 * position and then add us back in at a later 2987 * position. 2988 */ 2989 vn_syncer_add_to_worklist(*bo, syncdelay); 2990 } 2991 BO_UNLOCK(*bo); 2992 vdrop(vp); 2993 mtx_lock(&sync_mtx); 2994 return (0); 2995 } 2996 2997 static int first_printf = 1; 2998 2999 /* 3000 * System filesystem synchronizer daemon. 
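 * Roughly once a second the daemon advances syncer_delayno to the next
 * slot of the work wheel and syncs the bufobjs queued there; a non-zero
 * rushjob (see below) makes it process several slots back to back without
 * sleeping in between.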
3001 */ 3002 static void 3003 sched_sync(void) 3004 { 3005 struct synclist *next, *slp; 3006 struct bufobj *bo; 3007 long starttime; 3008 struct thread *td = curthread; 3009 int last_work_seen; 3010 int net_worklist_len; 3011 int syncer_final_iter; 3012 int error; 3013 3014 last_work_seen = 0; 3015 syncer_final_iter = 0; 3016 syncer_state = SYNCER_RUNNING; 3017 starttime = time_uptime; 3018 td->td_pflags |= TDP_NORUNNINGBUF; 3019 3020 EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc, 3021 SHUTDOWN_PRI_LAST); 3022 3023 mtx_lock(&sync_mtx); 3024 for (;;) { 3025 if (syncer_state == SYNCER_FINAL_DELAY && 3026 syncer_final_iter == 0) { 3027 mtx_unlock(&sync_mtx); 3028 kproc_suspend_check(td->td_proc); 3029 mtx_lock(&sync_mtx); 3030 } 3031 net_worklist_len = syncer_worklist_len - sync_vnode_count; 3032 if (syncer_state != SYNCER_RUNNING && 3033 starttime != time_uptime) { 3034 if (first_printf) { 3035 printf("\nSyncing disks, vnodes remaining... "); 3036 first_printf = 0; 3037 } 3038 printf("%d ", net_worklist_len); 3039 } 3040 starttime = time_uptime; 3041 3042 /* 3043 * Push files whose dirty time has expired. Be careful 3044 * of interrupt race on slp queue. 3045 * 3046 * Skip over empty worklist slots when shutting down. 3047 */ 3048 do { 3049 slp = &syncer_workitem_pending[syncer_delayno]; 3050 syncer_delayno += 1; 3051 if (syncer_delayno == syncer_maxdelay) 3052 syncer_delayno = 0; 3053 next = &syncer_workitem_pending[syncer_delayno]; 3054 /* 3055 * If the worklist has wrapped since 3056 * it was emptied of all but syncer vnodes, 3057 * switch to the FINAL_DELAY state and run 3058 * for one more second. 3059 */ 3060 if (syncer_state == SYNCER_SHUTTING_DOWN && 3061 net_worklist_len == 0 && 3062 last_work_seen == syncer_delayno) { 3063 syncer_state = SYNCER_FINAL_DELAY; 3064 syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP; 3065 } 3066 } while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) && 3067 syncer_worklist_len > 0); 3068 3069 /* 3070 * Keep track of the last time there was anything 3071 * on the worklist other than syncer vnodes. 3072 * Return to the SHUTTING_DOWN state if any 3073 * new work appears. 3074 */ 3075 if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING) 3076 last_work_seen = syncer_delayno; 3077 if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY) 3078 syncer_state = SYNCER_SHUTTING_DOWN; 3079 while (!LIST_EMPTY(slp)) { 3080 error = sync_vnode(slp, &bo, td); 3081 if (error == 1) { 3082 LIST_REMOVE(bo, bo_synclist); 3083 LIST_INSERT_HEAD(next, bo, bo_synclist); 3084 continue; 3085 } 3086 3087 if (first_printf == 0) { 3088 /* 3089 * Drop the sync mutex, because some watchdog 3090 * drivers need to sleep while patting the watchdog. 3091 */ 3092 mtx_unlock(&sync_mtx); 3093 wdog_kern_pat(WD_LASTVAL); 3094 mtx_lock(&sync_mtx); 3095 } 3096 } 3097 if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0) 3098 syncer_final_iter--; 3099 /* 3100 * The variable rushjob allows the kernel to speed up the 3101 * processing of the filesystem syncer process. A rushjob 3102 * value of N tells the filesystem syncer to process the next 3103 * N seconds worth of work on its queue ASAP. Currently rushjob 3104 * is used by the soft update code to speed up the filesystem 3105 * syncer process when the incore state is getting so far 3106 * ahead of the disk that the kernel memory pool is being 3107 * threatened with exhaustion.
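 * rushjob is bumped by speedup_syncer() below, which caps it at
 * syncdelay / 2 so that the syncer cannot take over the CPU.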
3108 */ 3109 if (rushjob > 0) { 3110 rushjob -= 1; 3111 continue; 3112 } 3113 /* 3114 * Just sleep for a short period of time between 3115 * iterations when shutting down to allow some I/O 3116 * to happen. 3117 * 3118 * If it has taken us less than a second to process the 3119 * current work, then wait. Otherwise start right over 3120 * again. We can still lose time if any single round 3121 * takes more than two seconds, but it does not really 3122 * matter as we are just trying to generally pace the 3123 * filesystem activity. 3124 */ 3125 if (syncer_state != SYNCER_RUNNING || 3126 time_uptime == starttime) { 3127 thread_lock(td); 3128 sched_prio(td, PPAUSE); 3129 thread_unlock(td); 3130 } 3131 if (syncer_state != SYNCER_RUNNING) 3132 cv_timedwait(&sync_wakeup, &sync_mtx, 3133 hz / SYNCER_SHUTDOWN_SPEEDUP); 3134 else if (time_uptime == starttime) 3135 cv_timedwait(&sync_wakeup, &sync_mtx, hz); 3136 } 3137 } 3138 3139 /* 3140 * Request the syncer daemon to speed up its work. 3141 * We never push it to speed up more than half of its 3142 * normal turn time, otherwise it could take over the cpu. 3143 */ 3144 int 3145 speedup_syncer(void) 3146 { 3147 int ret = 0; 3148 3149 mtx_lock(&sync_mtx); 3150 if (rushjob < syncdelay / 2) { 3151 rushjob += 1; 3152 stat_rush_requests += 1; 3153 ret = 1; 3154 } 3155 mtx_unlock(&sync_mtx); 3156 cv_broadcast(&sync_wakeup); 3157 return (ret); 3158 } 3159 3160 /* 3161 * Tell the syncer to speed up its work and run though its work 3162 * list several times, then tell it to shut down. 3163 */ 3164 static void 3165 syncer_shutdown(void *arg, int howto) 3166 { 3167 3168 if (howto & RB_NOSYNC) 3169 return; 3170 mtx_lock(&sync_mtx); 3171 syncer_state = SYNCER_SHUTTING_DOWN; 3172 rushjob = 0; 3173 mtx_unlock(&sync_mtx); 3174 cv_broadcast(&sync_wakeup); 3175 kproc_shutdown(arg, howto); 3176 } 3177 3178 void 3179 syncer_suspend(void) 3180 { 3181 3182 syncer_shutdown(updateproc, 0); 3183 } 3184 3185 void 3186 syncer_resume(void) 3187 { 3188 3189 mtx_lock(&sync_mtx); 3190 first_printf = 1; 3191 syncer_state = SYNCER_RUNNING; 3192 mtx_unlock(&sync_mtx); 3193 cv_broadcast(&sync_wakeup); 3194 kproc_resume(updateproc); 3195 } 3196 3197 /* 3198 * Move the buffer between the clean and dirty lists of its vnode. 3199 */ 3200 void 3201 reassignbuf(struct buf *bp) 3202 { 3203 struct vnode *vp; 3204 struct bufobj *bo; 3205 int delay; 3206 #ifdef INVARIANTS 3207 struct bufv *bv; 3208 #endif 3209 3210 vp = bp->b_vp; 3211 bo = bp->b_bufobj; 3212 3213 KASSERT((bp->b_flags & B_PAGING) == 0, 3214 ("%s: cannot reassign paging buffer %p", __func__, bp)); 3215 3216 CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X", 3217 bp, bp->b_vp, bp->b_flags); 3218 3219 BO_LOCK(bo); 3220 if ((bo->bo_flag & BO_NONSTERILE) == 0) { 3221 /* 3222 * Coordinate with getblk's unlocked lookup. Make 3223 * BO_NONSTERILE visible before the first reassignbuf produces 3224 * any side effect. This could be outside the bo lock if we 3225 * used a separate atomic flag field. 3226 */ 3227 bo->bo_flag |= BO_NONSTERILE; 3228 atomic_thread_fence_rel(); 3229 } 3230 buf_vlist_remove(bp); 3231 3232 /* 3233 * If dirty, put on list of dirty buffers; otherwise insert onto list 3234 * of clean buffers. 
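 * A dirty buffer also ensures the bufobj is on the syncer worklist, with a
 * delay picked by vnode type (dirdelay, metadelay or filedelay).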
3235 */ 3236 if (bp->b_flags & B_DELWRI) { 3237 if ((bo->bo_flag & BO_ONWORKLST) == 0) { 3238 switch (vp->v_type) { 3239 case VDIR: 3240 delay = dirdelay; 3241 break; 3242 case VCHR: 3243 delay = metadelay; 3244 break; 3245 default: 3246 delay = filedelay; 3247 } 3248 vn_syncer_add_to_worklist(bo, delay); 3249 } 3250 buf_vlist_add(bp, bo, BX_VNDIRTY); 3251 } else { 3252 buf_vlist_add(bp, bo, BX_VNCLEAN); 3253 3254 if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { 3255 mtx_lock(&sync_mtx); 3256 LIST_REMOVE(bo, bo_synclist); 3257 syncer_worklist_len--; 3258 mtx_unlock(&sync_mtx); 3259 bo->bo_flag &= ~BO_ONWORKLST; 3260 } 3261 } 3262 #ifdef INVARIANTS 3263 bv = &bo->bo_clean; 3264 bp = TAILQ_FIRST(&bv->bv_hd); 3265 KASSERT(bp == NULL || bp->b_bufobj == bo, 3266 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 3267 bp = TAILQ_LAST(&bv->bv_hd, buflists); 3268 KASSERT(bp == NULL || bp->b_bufobj == bo, 3269 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 3270 bv = &bo->bo_dirty; 3271 bp = TAILQ_FIRST(&bv->bv_hd); 3272 KASSERT(bp == NULL || bp->b_bufobj == bo, 3273 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 3274 bp = TAILQ_LAST(&bv->bv_hd, buflists); 3275 KASSERT(bp == NULL || bp->b_bufobj == bo, 3276 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 3277 #endif 3278 BO_UNLOCK(bo); 3279 } 3280 3281 static void 3282 v_init_counters(struct vnode *vp) 3283 { 3284 3285 VNASSERT(vp->v_type == VNON && vp->v_data == NULL && vp->v_iflag == 0, 3286 vp, ("%s called for an initialized vnode", __FUNCTION__)); 3287 ASSERT_VI_UNLOCKED(vp, __FUNCTION__); 3288 3289 refcount_init(&vp->v_holdcnt, 1); 3290 refcount_init(&vp->v_usecount, 1); 3291 } 3292 3293 /* 3294 * Get a usecount on a vnode. 3295 * 3296 * vget and vget_finish may fail to lock the vnode if they lose a race against 3297 * it being doomed. LK_RETRY can be passed in flags to lock it anyway. 3298 * 3299 * Consumers which don't guarantee liveness of the vnode can use SMR to 3300 * try to get a reference. Note this operation can fail since the vnode 3301 * may be awaiting getting freed by the time they get to it. 
3302 */ 3303 enum vgetstate 3304 vget_prep_smr(struct vnode *vp) 3305 { 3306 enum vgetstate vs; 3307 3308 VFS_SMR_ASSERT_ENTERED(); 3309 3310 if (refcount_acquire_if_not_zero(&vp->v_usecount)) { 3311 vs = VGET_USECOUNT; 3312 } else { 3313 if (vhold_smr(vp)) 3314 vs = VGET_HOLDCNT; 3315 else 3316 vs = VGET_NONE; 3317 } 3318 return (vs); 3319 } 3320 3321 enum vgetstate 3322 vget_prep(struct vnode *vp) 3323 { 3324 enum vgetstate vs; 3325 3326 if (refcount_acquire_if_not_zero(&vp->v_usecount)) { 3327 vs = VGET_USECOUNT; 3328 } else { 3329 vhold(vp); 3330 vs = VGET_HOLDCNT; 3331 } 3332 return (vs); 3333 } 3334 3335 void 3336 vget_abort(struct vnode *vp, enum vgetstate vs) 3337 { 3338 3339 switch (vs) { 3340 case VGET_USECOUNT: 3341 vrele(vp); 3342 break; 3343 case VGET_HOLDCNT: 3344 vdrop(vp); 3345 break; 3346 default: 3347 __assert_unreachable(); 3348 } 3349 } 3350 3351 int 3352 vget(struct vnode *vp, int flags) 3353 { 3354 enum vgetstate vs; 3355 3356 vs = vget_prep(vp); 3357 return (vget_finish(vp, flags, vs)); 3358 } 3359 3360 int 3361 vget_finish(struct vnode *vp, int flags, enum vgetstate vs) 3362 { 3363 int error; 3364 3365 if ((flags & LK_INTERLOCK) != 0) 3366 ASSERT_VI_LOCKED(vp, __func__); 3367 else 3368 ASSERT_VI_UNLOCKED(vp, __func__); 3369 VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp); 3370 VNPASS(vp->v_holdcnt > 0, vp); 3371 VNPASS(vs == VGET_HOLDCNT || vp->v_usecount > 0, vp); 3372 3373 error = vn_lock(vp, flags); 3374 if (__predict_false(error != 0)) { 3375 vget_abort(vp, vs); 3376 CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__, 3377 vp); 3378 return (error); 3379 } 3380 3381 vget_finish_ref(vp, vs); 3382 return (0); 3383 } 3384 3385 void 3386 vget_finish_ref(struct vnode *vp, enum vgetstate vs) 3387 { 3388 int old; 3389 3390 VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp); 3391 VNPASS(vp->v_holdcnt > 0, vp); 3392 VNPASS(vs == VGET_HOLDCNT || vp->v_usecount > 0, vp); 3393 3394 if (vs == VGET_USECOUNT) 3395 return; 3396 3397 /* 3398 * We hold the vnode. If the usecount is 0 it will be utilized to keep 3399 * the vnode around. Otherwise someone else lended their hold count and 3400 * we have to drop ours. 3401 */ 3402 old = atomic_fetchadd_int(&vp->v_usecount, 1); 3403 VNASSERT(old >= 0, vp, ("%s: wrong use count %d", __func__, old)); 3404 if (old != 0) { 3405 #ifdef INVARIANTS 3406 old = atomic_fetchadd_int(&vp->v_holdcnt, -1); 3407 VNASSERT(old > 1, vp, ("%s: wrong hold count %d", __func__, old)); 3408 #else 3409 refcount_release(&vp->v_holdcnt); 3410 #endif 3411 } 3412 } 3413 3414 void 3415 vref(struct vnode *vp) 3416 { 3417 enum vgetstate vs; 3418 3419 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3420 vs = vget_prep(vp); 3421 vget_finish_ref(vp, vs); 3422 } 3423 3424 void 3425 vrefact(struct vnode *vp) 3426 { 3427 int old __diagused; 3428 3429 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3430 old = refcount_acquire(&vp->v_usecount); 3431 VNASSERT(old > 0, vp, ("%s: wrong use count %d", __func__, old)); 3432 } 3433 3434 void 3435 vlazy(struct vnode *vp) 3436 { 3437 struct mount *mp; 3438 3439 VNASSERT(vp->v_holdcnt > 0, vp, ("%s: vnode not held", __func__)); 3440 3441 if ((vp->v_mflag & VMP_LAZYLIST) != 0) 3442 return; 3443 /* 3444 * We may get here for inactive routines after the vnode got doomed. 
3445 */ 3446 if (VN_IS_DOOMED(vp)) 3447 return; 3448 mp = vp->v_mount; 3449 mtx_lock(&mp->mnt_listmtx); 3450 if ((vp->v_mflag & VMP_LAZYLIST) == 0) { 3451 vp->v_mflag |= VMP_LAZYLIST; 3452 TAILQ_INSERT_TAIL(&mp->mnt_lazyvnodelist, vp, v_lazylist); 3453 mp->mnt_lazyvnodelistsize++; 3454 } 3455 mtx_unlock(&mp->mnt_listmtx); 3456 } 3457 3458 static void 3459 vunlazy(struct vnode *vp) 3460 { 3461 struct mount *mp; 3462 3463 ASSERT_VI_LOCKED(vp, __func__); 3464 VNPASS(!VN_IS_DOOMED(vp), vp); 3465 3466 mp = vp->v_mount; 3467 mtx_lock(&mp->mnt_listmtx); 3468 VNPASS(vp->v_mflag & VMP_LAZYLIST, vp); 3469 /* 3470 * Don't remove the vnode from the lazy list if another thread 3471 * has increased the hold count. It may have re-enqueued the 3472 * vnode to the lazy list and is now responsible for its 3473 * removal. 3474 */ 3475 if (vp->v_holdcnt == 0) { 3476 vp->v_mflag &= ~VMP_LAZYLIST; 3477 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, vp, v_lazylist); 3478 mp->mnt_lazyvnodelistsize--; 3479 } 3480 mtx_unlock(&mp->mnt_listmtx); 3481 } 3482 3483 /* 3484 * This routine is only meant to be called from vgonel prior to dooming 3485 * the vnode. 3486 */ 3487 static void 3488 vunlazy_gone(struct vnode *vp) 3489 { 3490 struct mount *mp; 3491 3492 ASSERT_VOP_ELOCKED(vp, __func__); 3493 ASSERT_VI_LOCKED(vp, __func__); 3494 VNPASS(!VN_IS_DOOMED(vp), vp); 3495 3496 if (vp->v_mflag & VMP_LAZYLIST) { 3497 mp = vp->v_mount; 3498 mtx_lock(&mp->mnt_listmtx); 3499 VNPASS(vp->v_mflag & VMP_LAZYLIST, vp); 3500 vp->v_mflag &= ~VMP_LAZYLIST; 3501 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, vp, v_lazylist); 3502 mp->mnt_lazyvnodelistsize--; 3503 mtx_unlock(&mp->mnt_listmtx); 3504 } 3505 } 3506 3507 static void 3508 vdefer_inactive(struct vnode *vp) 3509 { 3510 3511 ASSERT_VI_LOCKED(vp, __func__); 3512 VNPASS(vp->v_holdcnt > 0, vp); 3513 if (VN_IS_DOOMED(vp)) { 3514 vdropl(vp); 3515 return; 3516 } 3517 if (vp->v_iflag & VI_DEFINACT) { 3518 VNPASS(vp->v_holdcnt > 1, vp); 3519 vdropl(vp); 3520 return; 3521 } 3522 if (vp->v_usecount > 0) { 3523 vp->v_iflag &= ~VI_OWEINACT; 3524 vdropl(vp); 3525 return; 3526 } 3527 vlazy(vp); 3528 vp->v_iflag |= VI_DEFINACT; 3529 VI_UNLOCK(vp); 3530 atomic_add_long(&deferred_inact, 1); 3531 } 3532 3533 static void 3534 vdefer_inactive_unlocked(struct vnode *vp) 3535 { 3536 3537 VI_LOCK(vp); 3538 if ((vp->v_iflag & VI_OWEINACT) == 0) { 3539 vdropl(vp); 3540 return; 3541 } 3542 vdefer_inactive(vp); 3543 } 3544 3545 enum vput_op { VRELE, VPUT, VUNREF }; 3546 3547 /* 3548 * Handle ->v_usecount transitioning to 0. 3549 * 3550 * By releasing the last usecount we take ownership of the hold count which 3551 * provides liveness of the vnode, meaning we have to vdrop. 3552 * 3553 * For all vnodes we may need to perform inactive processing. It requires an 3554 * exclusive lock on the vnode, while it is legal to call here with only a 3555 * shared lock (or no locks). If locking the vnode in an expected manner fails, 3556 * inactive processing gets deferred to the syncer. 3557 * 3558 * XXX Some filesystems pass in an exclusively locked vnode and strongly depend 3559 * on the lock being held all the way until VOP_INACTIVE. This in particular 3560 * happens with UFS which adds half-constructed vnodes to the hash, where they 3561 * can be found by other code. 
3562 */ 3563 static void 3564 vput_final(struct vnode *vp, enum vput_op func) 3565 { 3566 int error; 3567 bool want_unlock; 3568 3569 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3570 VNPASS(vp->v_holdcnt > 0, vp); 3571 3572 VI_LOCK(vp); 3573 3574 /* 3575 * By the time we got here someone else might have transitioned 3576 * the count back to > 0. 3577 */ 3578 if (vp->v_usecount > 0) 3579 goto out; 3580 3581 /* 3582 * If the vnode is doomed vgone already performed inactive processing 3583 * (if needed). 3584 */ 3585 if (VN_IS_DOOMED(vp)) 3586 goto out; 3587 3588 if (__predict_true(VOP_NEED_INACTIVE(vp) == 0)) 3589 goto out; 3590 3591 if (vp->v_iflag & VI_DOINGINACT) 3592 goto out; 3593 3594 /* 3595 * Locking operations here will drop the interlock and possibly the 3596 * vnode lock, opening a window where the vnode can get doomed all the 3597 * while ->v_usecount is 0. Set VI_OWEINACT to let vgone know to 3598 * perform inactive. 3599 */ 3600 vp->v_iflag |= VI_OWEINACT; 3601 want_unlock = false; 3602 error = 0; 3603 switch (func) { 3604 case VRELE: 3605 switch (VOP_ISLOCKED(vp)) { 3606 case LK_EXCLUSIVE: 3607 break; 3608 case LK_EXCLOTHER: 3609 case 0: 3610 want_unlock = true; 3611 error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK); 3612 VI_LOCK(vp); 3613 break; 3614 default: 3615 /* 3616 * The lock has at least one sharer, but we have no way 3617 * to conclude whether this is us. Play it safe and 3618 * defer processing. 3619 */ 3620 error = EAGAIN; 3621 break; 3622 } 3623 break; 3624 case VPUT: 3625 want_unlock = true; 3626 if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { 3627 error = VOP_LOCK(vp, LK_UPGRADE | LK_INTERLOCK | 3628 LK_NOWAIT); 3629 VI_LOCK(vp); 3630 } 3631 break; 3632 case VUNREF: 3633 if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { 3634 error = VOP_LOCK(vp, LK_TRYUPGRADE | LK_INTERLOCK); 3635 VI_LOCK(vp); 3636 } 3637 break; 3638 } 3639 if (error == 0) { 3640 if (func == VUNREF) { 3641 VNASSERT((vp->v_vflag & VV_UNREF) == 0, vp, 3642 ("recursive vunref")); 3643 vp->v_vflag |= VV_UNREF; 3644 } 3645 for (;;) { 3646 error = vinactive(vp); 3647 if (want_unlock) 3648 VOP_UNLOCK(vp); 3649 if (error != ERELOOKUP || !want_unlock) 3650 break; 3651 VOP_LOCK(vp, LK_EXCLUSIVE); 3652 } 3653 if (func == VUNREF) 3654 vp->v_vflag &= ~VV_UNREF; 3655 vdropl(vp); 3656 } else { 3657 vdefer_inactive(vp); 3658 } 3659 return; 3660 out: 3661 if (func == VPUT) 3662 VOP_UNLOCK(vp); 3663 vdropl(vp); 3664 } 3665 3666 /* 3667 * Decrement ->v_usecount for a vnode. 3668 * 3669 * Releasing the last use count requires additional processing, see vput_final 3670 * above for details. 3671 * 3672 * Comment above each variant denotes lock state on entry and exit. 
3673 */ 3674 3675 /* 3676 * in: any 3677 * out: same as passed in 3678 */ 3679 void 3680 vrele(struct vnode *vp) 3681 { 3682 3683 ASSERT_VI_UNLOCKED(vp, __func__); 3684 if (!refcount_release(&vp->v_usecount)) 3685 return; 3686 vput_final(vp, VRELE); 3687 } 3688 3689 /* 3690 * in: locked 3691 * out: unlocked 3692 */ 3693 void 3694 vput(struct vnode *vp) 3695 { 3696 3697 ASSERT_VOP_LOCKED(vp, __func__); 3698 ASSERT_VI_UNLOCKED(vp, __func__); 3699 if (!refcount_release(&vp->v_usecount)) { 3700 VOP_UNLOCK(vp); 3701 return; 3702 } 3703 vput_final(vp, VPUT); 3704 } 3705 3706 /* 3707 * in: locked 3708 * out: locked 3709 */ 3710 void 3711 vunref(struct vnode *vp) 3712 { 3713 3714 ASSERT_VOP_LOCKED(vp, __func__); 3715 ASSERT_VI_UNLOCKED(vp, __func__); 3716 if (!refcount_release(&vp->v_usecount)) 3717 return; 3718 vput_final(vp, VUNREF); 3719 } 3720 3721 void 3722 vhold(struct vnode *vp) 3723 { 3724 int old; 3725 3726 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3727 old = atomic_fetchadd_int(&vp->v_holdcnt, 1); 3728 VNASSERT(old >= 0 && (old & VHOLD_ALL_FLAGS) == 0, vp, 3729 ("%s: wrong hold count %d", __func__, old)); 3730 if (old == 0) 3731 vfs_freevnodes_dec(); 3732 } 3733 3734 void 3735 vholdnz(struct vnode *vp) 3736 { 3737 3738 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3739 #ifdef INVARIANTS 3740 int old = atomic_fetchadd_int(&vp->v_holdcnt, 1); 3741 VNASSERT(old > 0 && (old & VHOLD_ALL_FLAGS) == 0, vp, 3742 ("%s: wrong hold count %d", __func__, old)); 3743 #else 3744 atomic_add_int(&vp->v_holdcnt, 1); 3745 #endif 3746 } 3747 3748 /* 3749 * Grab a hold count unless the vnode is freed. 3750 * 3751 * Only use this routine if vfs smr is the only protection you have against 3752 * freeing the vnode. 3753 * 3754 * The code loops trying to add a hold count as long as the VHOLD_NO_SMR flag 3755 * is not set. After the flag is set the vnode becomes immutable to anyone but 3756 * the thread which managed to set the flag. 3757 * 3758 * It may be tempting to replace the loop with: 3759 * count = atomic_fetchadd_int(&vp->v_holdcnt, 1); 3760 * if (count & VHOLD_NO_SMR) { 3761 * backpedal and error out; 3762 * } 3763 * 3764 * However, while this is more performant, it hinders debugging by eliminating 3765 * the previously mentioned invariant. 3766 */ 3767 bool 3768 vhold_smr(struct vnode *vp) 3769 { 3770 int count; 3771 3772 VFS_SMR_ASSERT_ENTERED(); 3773 3774 count = atomic_load_int(&vp->v_holdcnt); 3775 for (;;) { 3776 if (count & VHOLD_NO_SMR) { 3777 VNASSERT((count & ~VHOLD_NO_SMR) == 0, vp, 3778 ("non-zero hold count with flags %d\n", count)); 3779 return (false); 3780 } 3781 VNASSERT(count >= 0, vp, ("invalid hold count %d\n", count)); 3782 if (atomic_fcmpset_int(&vp->v_holdcnt, &count, count + 1)) { 3783 if (count == 0) 3784 vfs_freevnodes_dec(); 3785 return (true); 3786 } 3787 } 3788 } 3789 3790 /* 3791 * Hold a free vnode for recycling. 3792 * 3793 * Note: vnode_init references this comment. 3794 * 3795 * Attempts to recycle only need the global vnode list lock and have no use for 3796 * SMR. 3797 * 3798 * However, vnodes get inserted into the global list before they get fully 3799 * initialized and stay there until UMA decides to free the memory. This in 3800 * particular means the target can be found before it becomes usable and after 3801 * it becomes recycled. Picking up such vnodes is guarded with v_holdcnt set to 3802 * VHOLD_NO_SMR. 3803 * 3804 * Note: the vnode may gain more references after we transition the count 0->1. 
3805 */ 3806 static bool 3807 vhold_recycle_free(struct vnode *vp) 3808 { 3809 int count; 3810 3811 mtx_assert(&vnode_list_mtx, MA_OWNED); 3812 3813 count = atomic_load_int(&vp->v_holdcnt); 3814 for (;;) { 3815 if (count & VHOLD_NO_SMR) { 3816 VNASSERT((count & ~VHOLD_NO_SMR) == 0, vp, 3817 ("non-zero hold count with flags %d\n", count)); 3818 return (false); 3819 } 3820 VNASSERT(count >= 0, vp, ("invalid hold count %d\n", count)); 3821 if (count > 0) { 3822 return (false); 3823 } 3824 if (atomic_fcmpset_int(&vp->v_holdcnt, &count, count + 1)) { 3825 vfs_freevnodes_dec(); 3826 return (true); 3827 } 3828 } 3829 } 3830 3831 static void __noinline 3832 vdbatch_process(struct vdbatch *vd) 3833 { 3834 struct vnode *vp; 3835 int i; 3836 3837 mtx_assert(&vd->lock, MA_OWNED); 3838 MPASS(curthread->td_pinned > 0); 3839 MPASS(vd->index == VDBATCH_SIZE); 3840 3841 /* 3842 * Attempt to requeue the passed batch, but give up easily. 3843 * 3844 * Despite batching the mechanism is prone to transient *significant* 3845 * lock contention, where vnode_list_mtx becomes the primary bottleneck 3846 * if multiple CPUs get here (one real-world example is highly parallel 3847 * do-nothing make, which will stat *tons* of vnodes). Since it is 3848 * quasi-LRU (read: not that great even if fully honoured) provide an 3849 * option to just dodge the problem. Parties which don't like it are 3850 * welcome to implement something better. 3851 */ 3852 if (vnode_can_skip_requeue) { 3853 if (!mtx_trylock(&vnode_list_mtx)) { 3854 counter_u64_add(vnode_skipped_requeues, 1); 3855 critical_enter(); 3856 for (i = 0; i < VDBATCH_SIZE; i++) { 3857 vp = vd->tab[i]; 3858 vd->tab[i] = NULL; 3859 MPASS(vp->v_dbatchcpu != NOCPU); 3860 vp->v_dbatchcpu = NOCPU; 3861 } 3862 vd->index = 0; 3863 critical_exit(); 3864 return; 3865 3866 } 3867 /* fallthrough to locked processing */ 3868 } else { 3869 mtx_lock(&vnode_list_mtx); 3870 } 3871 3872 mtx_assert(&vnode_list_mtx, MA_OWNED); 3873 critical_enter(); 3874 for (i = 0; i < VDBATCH_SIZE; i++) { 3875 vp = vd->tab[i]; 3876 vd->tab[i] = NULL; 3877 TAILQ_REMOVE(&vnode_list, vp, v_vnodelist); 3878 TAILQ_INSERT_TAIL(&vnode_list, vp, v_vnodelist); 3879 MPASS(vp->v_dbatchcpu != NOCPU); 3880 vp->v_dbatchcpu = NOCPU; 3881 } 3882 mtx_unlock(&vnode_list_mtx); 3883 vd->index = 0; 3884 critical_exit(); 3885 } 3886 3887 static void 3888 vdbatch_enqueue(struct vnode *vp) 3889 { 3890 struct vdbatch *vd; 3891 3892 ASSERT_VI_LOCKED(vp, __func__); 3893 VNPASS(!VN_IS_DOOMED(vp), vp); 3894 3895 if (vp->v_dbatchcpu != NOCPU) { 3896 VI_UNLOCK(vp); 3897 return; 3898 } 3899 3900 sched_pin(); 3901 vd = DPCPU_PTR(vd); 3902 mtx_lock(&vd->lock); 3903 MPASS(vd->index < VDBATCH_SIZE); 3904 MPASS(vd->tab[vd->index] == NULL); 3905 /* 3906 * A hack: we depend on being pinned so that we know what to put in 3907 * ->v_dbatchcpu. 3908 */ 3909 vp->v_dbatchcpu = curcpu; 3910 vd->tab[vd->index] = vp; 3911 vd->index++; 3912 VI_UNLOCK(vp); 3913 if (vd->index == VDBATCH_SIZE) 3914 vdbatch_process(vd); 3915 mtx_unlock(&vd->lock); 3916 sched_unpin(); 3917 } 3918 3919 /* 3920 * This routine must only be called for vnodes which are about to be 3921 * deallocated. Supporting dequeue for arbitrary vnodes would require 3922 * validating that the locked batch matches.
3923 */ 3924 static void 3925 vdbatch_dequeue(struct vnode *vp) 3926 { 3927 struct vdbatch *vd; 3928 int i; 3929 short cpu; 3930 3931 VNPASS(vp->v_type == VBAD || vp->v_type == VNON, vp); 3932 3933 cpu = vp->v_dbatchcpu; 3934 if (cpu == NOCPU) 3935 return; 3936 3937 vd = DPCPU_ID_PTR(cpu, vd); 3938 mtx_lock(&vd->lock); 3939 for (i = 0; i < vd->index; i++) { 3940 if (vd->tab[i] != vp) 3941 continue; 3942 vp->v_dbatchcpu = NOCPU; 3943 vd->index--; 3944 vd->tab[i] = vd->tab[vd->index]; 3945 vd->tab[vd->index] = NULL; 3946 break; 3947 } 3948 mtx_unlock(&vd->lock); 3949 /* 3950 * Either we dequeued the vnode above or the target CPU beat us to it. 3951 */ 3952 MPASS(vp->v_dbatchcpu == NOCPU); 3953 } 3954 3955 /* 3956 * Drop the hold count of the vnode. 3957 * 3958 * It will only get freed if this is the last hold *and* it has been vgone'd. 3959 * 3960 * Because the vnode vm object keeps a hold reference on the vnode if 3961 * there is at least one resident non-cached page, the vnode cannot 3962 * leave the active list without the page cleanup done. 3963 */ 3964 static void __noinline 3965 vdropl_final(struct vnode *vp) 3966 { 3967 3968 ASSERT_VI_LOCKED(vp, __func__); 3969 VNPASS(VN_IS_DOOMED(vp), vp); 3970 /* 3971 * Set the VHOLD_NO_SMR flag. 3972 * 3973 * We may be racing against vhold_smr. If they win we can just pretend 3974 * we never got this far, they will vdrop later. 3975 */ 3976 if (__predict_false(!atomic_cmpset_int(&vp->v_holdcnt, 0, VHOLD_NO_SMR))) { 3977 vfs_freevnodes_inc(); 3978 VI_UNLOCK(vp); 3979 /* 3980 * We lost the aforementioned race. Any subsequent access is 3981 * invalid as they might have managed to vdropl on their own. 3982 */ 3983 return; 3984 } 3985 /* 3986 * Don't bump freevnodes as this one is going away. 3987 */ 3988 freevnode(vp); 3989 } 3990 3991 void 3992 vdrop(struct vnode *vp) 3993 { 3994 3995 ASSERT_VI_UNLOCKED(vp, __func__); 3996 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3997 if (refcount_release_if_not_last(&vp->v_holdcnt)) 3998 return; 3999 VI_LOCK(vp); 4000 vdropl(vp); 4001 } 4002 4003 static __always_inline void 4004 vdropl_impl(struct vnode *vp, bool enqueue) 4005 { 4006 4007 ASSERT_VI_LOCKED(vp, __func__); 4008 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 4009 if (!refcount_release(&vp->v_holdcnt)) { 4010 VI_UNLOCK(vp); 4011 return; 4012 } 4013 VNPASS((vp->v_iflag & VI_OWEINACT) == 0, vp); 4014 VNPASS((vp->v_iflag & VI_DEFINACT) == 0, vp); 4015 if (VN_IS_DOOMED(vp)) { 4016 vdropl_final(vp); 4017 return; 4018 } 4019 4020 vfs_freevnodes_inc(); 4021 if (vp->v_mflag & VMP_LAZYLIST) { 4022 vunlazy(vp); 4023 } 4024 4025 if (!enqueue) { 4026 VI_UNLOCK(vp); 4027 return; 4028 } 4029 4030 /* 4031 * Also unlocks the interlock. We can't assert on it as we 4032 * released our hold and by now the vnode might have been 4033 * freed. 4034 */ 4035 vdbatch_enqueue(vp); 4036 } 4037 4038 void 4039 vdropl(struct vnode *vp) 4040 { 4041 4042 vdropl_impl(vp, true); 4043 } 4044 4045 /* 4046 * vdrop a vnode when recycling 4047 * 4048 * This is a special case routine only to be used when recycling; it differs from 4049 * regular vdrop by not requeueing the vnode on the LRU. 4050 * 4051 * Consider a case where vtryrecycle continuously fails with all vnodes (due to 4052 * e.g., frozen writes on the filesystem), filling the batch and causing it to 4053 * be requeued. Then vnlru will end up revisiting the same vnodes. This is a 4054 * loop which can last for as long as writes are frozen.
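 *
 * Skipping the requeue means a failed candidate is not moved to the tail of
 * the global vnode list by vdbatch_process(), so subsequent passes do not
 * keep rediscovering the same set of vnodes.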
4055 */ 4056 static void 4057 vdropl_recycle(struct vnode *vp) 4058 { 4059 4060 vdropl_impl(vp, false); 4061 } 4062 4063 static void 4064 vdrop_recycle(struct vnode *vp) 4065 { 4066 4067 VI_LOCK(vp); 4068 vdropl_recycle(vp); 4069 } 4070 4071 /* 4072 * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT 4073 * flags. DOINGINACT prevents us from recursing in calls to vinactive. 4074 */ 4075 static int 4076 vinactivef(struct vnode *vp) 4077 { 4078 int error; 4079 4080 ASSERT_VOP_ELOCKED(vp, "vinactive"); 4081 ASSERT_VI_LOCKED(vp, "vinactive"); 4082 VNPASS((vp->v_iflag & VI_DOINGINACT) == 0, vp); 4083 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 4084 vp->v_iflag |= VI_DOINGINACT; 4085 vp->v_iflag &= ~VI_OWEINACT; 4086 VI_UNLOCK(vp); 4087 4088 /* 4089 * Before moving off the active list, we must be sure that any 4090 * modified pages are converted into the vnode's dirty 4091 * buffers, since these will no longer be checked once the 4092 * vnode is on the inactive list. 4093 * 4094 * The write-out of the dirty pages is asynchronous. At the 4095 * point that VOP_INACTIVE() is called, there could still be 4096 * pending I/O and dirty pages in the object. 4097 */ 4098 if ((vp->v_vflag & VV_NOSYNC) == 0) 4099 vnode_pager_clean_async(vp); 4100 4101 error = VOP_INACTIVE(vp); 4102 VI_LOCK(vp); 4103 VNPASS(vp->v_iflag & VI_DOINGINACT, vp); 4104 vp->v_iflag &= ~VI_DOINGINACT; 4105 return (error); 4106 } 4107 4108 int 4109 vinactive(struct vnode *vp) 4110 { 4111 4112 ASSERT_VOP_ELOCKED(vp, "vinactive"); 4113 ASSERT_VI_LOCKED(vp, "vinactive"); 4114 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 4115 4116 if ((vp->v_iflag & VI_OWEINACT) == 0) 4117 return (0); 4118 if (vp->v_iflag & VI_DOINGINACT) 4119 return (0); 4120 if (vp->v_usecount > 0) { 4121 vp->v_iflag &= ~VI_OWEINACT; 4122 return (0); 4123 } 4124 return (vinactivef(vp)); 4125 } 4126 4127 /* 4128 * Remove any vnodes in the vnode table belonging to mount point mp. 4129 * 4130 * If FORCECLOSE is not specified, there should not be any active ones, 4131 * return error if any are found (nb: this is a user error, not a 4132 * system error). If FORCECLOSE is specified, detach any active vnodes 4133 * that are found. 4134 * 4135 * If WRITECLOSE is set, only flush out regular file vnodes open for 4136 * writing. 4137 * 4138 * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped. 4139 * 4140 * `rootrefs' specifies the base reference count for the root vnode 4141 * of this filesystem. The root vnode is considered busy if its 4142 * v_usecount exceeds this value. On a successful return, vflush(, td) 4143 * will call vrele() on the root vnode exactly rootrefs times. 4144 * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must 4145 * be zero. 4146 */ 4147 #ifdef DIAGNOSTIC 4148 static int busyprt = 0; /* print out busy vnodes */ 4149 SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "Print out busy vnodes"); 4150 #endif 4151 4152 int 4153 vflush(struct mount *mp, int rootrefs, int flags, struct thread *td) 4154 { 4155 struct vnode *vp, *mvp, *rootvp = NULL; 4156 struct vattr vattr; 4157 int busy = 0, error; 4158 4159 CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp, 4160 rootrefs, flags); 4161 if (rootrefs > 0) { 4162 KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0, 4163 ("vflush: bad args")); 4164 /* 4165 * Get the filesystem root vnode. We can vput() it 4166 * immediately, since with rootrefs > 0, it won't go away. 
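 * The caller's rootrefs references are what keep the use count up; the
 * VFS_ROOT() call is only needed to obtain the pointer used for the busy
 * check and the optional vgone() at the end of this routine.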
4167 */ 4168 if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) { 4169 CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d", 4170 __func__, error); 4171 return (error); 4172 } 4173 vput(rootvp); 4174 } 4175 loop: 4176 MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { 4177 vholdl(vp); 4178 error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE); 4179 if (error) { 4180 vdrop(vp); 4181 MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); 4182 goto loop; 4183 } 4184 /* 4185 * Skip over a vnodes marked VV_SYSTEM. 4186 */ 4187 if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) { 4188 VOP_UNLOCK(vp); 4189 vdrop(vp); 4190 continue; 4191 } 4192 /* 4193 * If WRITECLOSE is set, flush out unlinked but still open 4194 * files (even if open only for reading) and regular file 4195 * vnodes open for writing. 4196 */ 4197 if (flags & WRITECLOSE) { 4198 vnode_pager_clean_async(vp); 4199 do { 4200 error = VOP_FSYNC(vp, MNT_WAIT, td); 4201 } while (error == ERELOOKUP); 4202 if (error != 0) { 4203 VOP_UNLOCK(vp); 4204 vdrop(vp); 4205 MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); 4206 return (error); 4207 } 4208 error = VOP_GETATTR(vp, &vattr, td->td_ucred); 4209 VI_LOCK(vp); 4210 4211 if ((vp->v_type == VNON || 4212 (error == 0 && vattr.va_nlink > 0)) && 4213 (vp->v_writecount <= 0 || vp->v_type != VREG)) { 4214 VOP_UNLOCK(vp); 4215 vdropl(vp); 4216 continue; 4217 } 4218 } else 4219 VI_LOCK(vp); 4220 /* 4221 * With v_usecount == 0, all we need to do is clear out the 4222 * vnode data structures and we are done. 4223 * 4224 * If FORCECLOSE is set, forcibly close the vnode. 4225 */ 4226 if (vp->v_usecount == 0 || (flags & FORCECLOSE)) { 4227 vgonel(vp); 4228 } else { 4229 busy++; 4230 #ifdef DIAGNOSTIC 4231 if (busyprt) 4232 vn_printf(vp, "vflush: busy vnode "); 4233 #endif 4234 } 4235 VOP_UNLOCK(vp); 4236 vdropl(vp); 4237 } 4238 if (rootrefs > 0 && (flags & FORCECLOSE) == 0) { 4239 /* 4240 * If just the root vnode is busy, and if its refcount 4241 * is equal to `rootrefs', then go ahead and kill it. 4242 */ 4243 VI_LOCK(rootvp); 4244 KASSERT(busy > 0, ("vflush: not busy")); 4245 VNASSERT(rootvp->v_usecount >= rootrefs, rootvp, 4246 ("vflush: usecount %d < rootrefs %d", 4247 rootvp->v_usecount, rootrefs)); 4248 if (busy == 1 && rootvp->v_usecount == rootrefs) { 4249 VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK); 4250 vgone(rootvp); 4251 VOP_UNLOCK(rootvp); 4252 busy = 0; 4253 } else 4254 VI_UNLOCK(rootvp); 4255 } 4256 if (busy) { 4257 CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__, 4258 busy); 4259 return (EBUSY); 4260 } 4261 for (; rootrefs > 0; rootrefs--) 4262 vrele(rootvp); 4263 return (0); 4264 } 4265 4266 /* 4267 * Recycle an unused vnode. 4268 */ 4269 int 4270 vrecycle(struct vnode *vp) 4271 { 4272 int recycled; 4273 4274 VI_LOCK(vp); 4275 recycled = vrecyclel(vp); 4276 VI_UNLOCK(vp); 4277 return (recycled); 4278 } 4279 4280 /* 4281 * vrecycle, with the vp interlock held. 4282 */ 4283 int 4284 vrecyclel(struct vnode *vp) 4285 { 4286 int recycled; 4287 4288 ASSERT_VOP_ELOCKED(vp, __func__); 4289 ASSERT_VI_LOCKED(vp, __func__); 4290 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 4291 recycled = 0; 4292 if (vp->v_usecount == 0) { 4293 recycled = 1; 4294 vgonel(vp); 4295 } 4296 return (recycled); 4297 } 4298 4299 /* 4300 * Eliminate all activity associated with a vnode 4301 * in preparation for reuse. 4302 */ 4303 void 4304 vgone(struct vnode *vp) 4305 { 4306 VI_LOCK(vp); 4307 vgonel(vp); 4308 VI_UNLOCK(vp); 4309 } 4310 4311 /* 4312 * Notify upper mounts about reclaimed or unlinked vnode. 
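 *
 * Stacked filesystems (nullfs, for example) register themselves on
 * mnt_notify so that their upper vnodes can be invalidated when the lower
 * vnode they reference is reclaimed or unlinked.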
4313 */ 4314 void 4315 vfs_notify_upper(struct vnode *vp, enum vfs_notify_upper_type event) 4316 { 4317 struct mount *mp; 4318 struct mount_upper_node *ump; 4319 4320 mp = atomic_load_ptr(&vp->v_mount); 4321 if (mp == NULL) 4322 return; 4323 if (TAILQ_EMPTY(&mp->mnt_notify)) 4324 return; 4325 4326 MNT_ILOCK(mp); 4327 mp->mnt_upper_pending++; 4328 KASSERT(mp->mnt_upper_pending > 0, 4329 ("%s: mnt_upper_pending %d", __func__, mp->mnt_upper_pending)); 4330 TAILQ_FOREACH(ump, &mp->mnt_notify, mnt_upper_link) { 4331 MNT_IUNLOCK(mp); 4332 switch (event) { 4333 case VFS_NOTIFY_UPPER_RECLAIM: 4334 VFS_RECLAIM_LOWERVP(ump->mp, vp); 4335 break; 4336 case VFS_NOTIFY_UPPER_UNLINK: 4337 VFS_UNLINK_LOWERVP(ump->mp, vp); 4338 break; 4339 } 4340 MNT_ILOCK(mp); 4341 } 4342 mp->mnt_upper_pending--; 4343 if ((mp->mnt_kern_flag & MNTK_UPPER_WAITER) != 0 && 4344 mp->mnt_upper_pending == 0) { 4345 mp->mnt_kern_flag &= ~MNTK_UPPER_WAITER; 4346 wakeup(&mp->mnt_uppers); 4347 } 4348 MNT_IUNLOCK(mp); 4349 } 4350 4351 /* 4352 * vgone, with the vp interlock held. 4353 */ 4354 static void 4355 vgonel(struct vnode *vp) 4356 { 4357 struct thread *td; 4358 struct mount *mp; 4359 vm_object_t object; 4360 bool active, doinginact, oweinact; 4361 4362 ASSERT_VOP_ELOCKED(vp, "vgonel"); 4363 ASSERT_VI_LOCKED(vp, "vgonel"); 4364 VNASSERT(vp->v_holdcnt, vp, 4365 ("vgonel: vp %p has no reference.", vp)); 4366 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 4367 td = curthread; 4368 4369 /* 4370 * Don't vgonel if we're already doomed. 4371 */ 4372 if (VN_IS_DOOMED(vp)) { 4373 VNPASS(vn_get_state(vp) == VSTATE_DESTROYING || \ 4374 vn_get_state(vp) == VSTATE_DEAD, vp); 4375 return; 4376 } 4377 /* 4378 * Paired with freevnode. 4379 */ 4380 vn_seqc_write_begin_locked(vp); 4381 vunlazy_gone(vp); 4382 vn_irflag_set_locked(vp, VIRF_DOOMED); 4383 vn_set_state(vp, VSTATE_DESTROYING); 4384 4385 /* 4386 * Check to see if the vnode is in use. If so, we have to 4387 * call VOP_CLOSE() and VOP_INACTIVE(). 4388 * 4389 * It could be that VOP_INACTIVE() requested reclamation, in 4390 * which case we should avoid recursion, so check 4391 * VI_DOINGINACT. This is not precise but good enough. 4392 */ 4393 active = vp->v_usecount > 0; 4394 oweinact = (vp->v_iflag & VI_OWEINACT) != 0; 4395 doinginact = (vp->v_iflag & VI_DOINGINACT) != 0; 4396 4397 /* 4398 * If we need to do inactive VI_OWEINACT will be set. 4399 */ 4400 if (vp->v_iflag & VI_DEFINACT) { 4401 VNASSERT(vp->v_holdcnt > 1, vp, ("lost hold count")); 4402 vp->v_iflag &= ~VI_DEFINACT; 4403 vdropl(vp); 4404 } else { 4405 VNASSERT(vp->v_holdcnt > 0, vp, ("vnode without hold count")); 4406 VI_UNLOCK(vp); 4407 } 4408 cache_purge_vgone(vp); 4409 vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM); 4410 4411 /* 4412 * If purging an active vnode, it must be closed and 4413 * deactivated before being reclaimed. 4414 */ 4415 if (active) 4416 VOP_CLOSE(vp, FNONBLOCK, NOCRED, td); 4417 if (!doinginact) { 4418 do { 4419 if (oweinact || active) { 4420 VI_LOCK(vp); 4421 vinactivef(vp); 4422 oweinact = (vp->v_iflag & VI_OWEINACT) != 0; 4423 VI_UNLOCK(vp); 4424 } 4425 } while (oweinact); 4426 } 4427 if (vp->v_type == VSOCK) 4428 vfs_unp_reclaim(vp); 4429 4430 /* 4431 * Clean out any buffers associated with the vnode. 4432 * If the flush fails, just toss the buffers. 
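 * The V_SAVE pass below tries to write dirty buffers back first; only if
 * that fails is everything discarded with the plain vinvalbuf() loop.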
4433 */ 4434 mp = NULL; 4435 if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd)) 4436 (void) vn_start_secondary_write(vp, &mp, V_WAIT); 4437 if (vinvalbuf(vp, V_SAVE, 0, 0) != 0) { 4438 while (vinvalbuf(vp, 0, 0, 0) != 0) 4439 ; 4440 } 4441 4442 BO_LOCK(&vp->v_bufobj); 4443 KASSERT(TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd) && 4444 vp->v_bufobj.bo_dirty.bv_cnt == 0 && 4445 TAILQ_EMPTY(&vp->v_bufobj.bo_clean.bv_hd) && 4446 vp->v_bufobj.bo_clean.bv_cnt == 0, 4447 ("vp %p bufobj not invalidated", vp)); 4448 4449 /* 4450 * For VMIO bufobj, BO_DEAD is set later, or in 4451 * vm_object_terminate() after the object's page queue is 4452 * flushed. 4453 */ 4454 object = vp->v_bufobj.bo_object; 4455 if (object == NULL) 4456 vp->v_bufobj.bo_flag |= BO_DEAD; 4457 BO_UNLOCK(&vp->v_bufobj); 4458 4459 /* 4460 * Handle the VM part. Tmpfs handles v_object on its own (the 4461 * OBJT_VNODE check). Nullfs or other bypassing filesystems 4462 * should not touch the object borrowed from the lower vnode 4463 * (the handle check). 4464 */ 4465 if (object != NULL && object->type == OBJT_VNODE && 4466 object->handle == vp) 4467 vnode_destroy_vobject(vp); 4468 4469 /* 4470 * Reclaim the vnode. 4471 */ 4472 if (VOP_RECLAIM(vp)) 4473 panic("vgone: cannot reclaim"); 4474 if (mp != NULL) 4475 vn_finished_secondary_write(mp); 4476 VNASSERT(vp->v_object == NULL, vp, 4477 ("vop_reclaim left v_object vp=%p", vp)); 4478 /* 4479 * Clear the advisory locks and wake up waiting threads. 4480 */ 4481 if (vp->v_lockf != NULL) { 4482 (void)VOP_ADVLOCKPURGE(vp); 4483 vp->v_lockf = NULL; 4484 } 4485 /* 4486 * Delete from old mount point vnode list. 4487 */ 4488 if (vp->v_mount == NULL) { 4489 VI_LOCK(vp); 4490 } else { 4491 delmntque(vp); 4492 ASSERT_VI_LOCKED(vp, "vgonel 2"); 4493 } 4494 /* 4495 * Done with purge, reset to the standard lock and invalidate 4496 * the vnode. 4497 */ 4498 vp->v_vnlock = &vp->v_lock; 4499 vp->v_op = &dead_vnodeops; 4500 vp->v_type = VBAD; 4501 vn_set_state(vp, VSTATE_DEAD); 4502 } 4503 4504 /* 4505 * Print out a description of a vnode. 4506 */ 4507 static const char *const vtypename[] = { 4508 [VNON] = "VNON", 4509 [VREG] = "VREG", 4510 [VDIR] = "VDIR", 4511 [VBLK] = "VBLK", 4512 [VCHR] = "VCHR", 4513 [VLNK] = "VLNK", 4514 [VSOCK] = "VSOCK", 4515 [VFIFO] = "VFIFO", 4516 [VBAD] = "VBAD", 4517 [VMARKER] = "VMARKER", 4518 }; 4519 _Static_assert(nitems(vtypename) == VLASTTYPE + 1, 4520 "vnode type name not added to vtypename"); 4521 4522 static const char *const vstatename[] = { 4523 [VSTATE_UNINITIALIZED] = "VSTATE_UNINITIALIZED", 4524 [VSTATE_CONSTRUCTED] = "VSTATE_CONSTRUCTED", 4525 [VSTATE_DESTROYING] = "VSTATE_DESTROYING", 4526 [VSTATE_DEAD] = "VSTATE_DEAD", 4527 }; 4528 _Static_assert(nitems(vstatename) == VLASTSTATE + 1, 4529 "vnode state name not added to vstatename"); 4530 4531 _Static_assert((VHOLD_ALL_FLAGS & ~VHOLD_NO_SMR) == 0, 4532 "new hold count flag not added to vn_printf"); 4533 4534 void 4535 vn_printf(struct vnode *vp, const char *fmt, ...) 
4536 { 4537 va_list ap; 4538 char buf[256], buf2[16]; 4539 u_long flags; 4540 u_int holdcnt; 4541 short irflag; 4542 4543 va_start(ap, fmt); 4544 vprintf(fmt, ap); 4545 va_end(ap); 4546 printf("%p: ", (void *)vp); 4547 printf("type %s state %s op %p\n", vtypename[vp->v_type], 4548 vstatename[vp->v_state], vp->v_op); 4549 holdcnt = atomic_load_int(&vp->v_holdcnt); 4550 printf(" usecount %d, writecount %d, refcount %d seqc users %d", 4551 vp->v_usecount, vp->v_writecount, holdcnt & ~VHOLD_ALL_FLAGS, 4552 vp->v_seqc_users); 4553 switch (vp->v_type) { 4554 case VDIR: 4555 printf(" mountedhere %p\n", vp->v_mountedhere); 4556 break; 4557 case VCHR: 4558 printf(" rdev %p\n", vp->v_rdev); 4559 break; 4560 case VSOCK: 4561 printf(" socket %p\n", vp->v_unpcb); 4562 break; 4563 case VFIFO: 4564 printf(" fifoinfo %p\n", vp->v_fifoinfo); 4565 break; 4566 default: 4567 printf("\n"); 4568 break; 4569 } 4570 buf[0] = '\0'; 4571 buf[1] = '\0'; 4572 if (holdcnt & VHOLD_NO_SMR) 4573 strlcat(buf, "|VHOLD_NO_SMR", sizeof(buf)); 4574 printf(" hold count flags (%s)\n", buf + 1); 4575 4576 buf[0] = '\0'; 4577 buf[1] = '\0'; 4578 irflag = vn_irflag_read(vp); 4579 if (irflag & VIRF_DOOMED) 4580 strlcat(buf, "|VIRF_DOOMED", sizeof(buf)); 4581 if (irflag & VIRF_PGREAD) 4582 strlcat(buf, "|VIRF_PGREAD", sizeof(buf)); 4583 if (irflag & VIRF_MOUNTPOINT) 4584 strlcat(buf, "|VIRF_MOUNTPOINT", sizeof(buf)); 4585 if (irflag & VIRF_TEXT_REF) 4586 strlcat(buf, "|VIRF_TEXT_REF", sizeof(buf)); 4587 flags = irflag & ~(VIRF_DOOMED | VIRF_PGREAD | VIRF_MOUNTPOINT | VIRF_TEXT_REF); 4588 if (flags != 0) { 4589 snprintf(buf2, sizeof(buf2), "|VIRF(0x%lx)", flags); 4590 strlcat(buf, buf2, sizeof(buf)); 4591 } 4592 if (vp->v_vflag & VV_ROOT) 4593 strlcat(buf, "|VV_ROOT", sizeof(buf)); 4594 if (vp->v_vflag & VV_ISTTY) 4595 strlcat(buf, "|VV_ISTTY", sizeof(buf)); 4596 if (vp->v_vflag & VV_NOSYNC) 4597 strlcat(buf, "|VV_NOSYNC", sizeof(buf)); 4598 if (vp->v_vflag & VV_ETERNALDEV) 4599 strlcat(buf, "|VV_ETERNALDEV", sizeof(buf)); 4600 if (vp->v_vflag & VV_CACHEDLABEL) 4601 strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf)); 4602 if (vp->v_vflag & VV_VMSIZEVNLOCK) 4603 strlcat(buf, "|VV_VMSIZEVNLOCK", sizeof(buf)); 4604 if (vp->v_vflag & VV_COPYONWRITE) 4605 strlcat(buf, "|VV_COPYONWRITE", sizeof(buf)); 4606 if (vp->v_vflag & VV_SYSTEM) 4607 strlcat(buf, "|VV_SYSTEM", sizeof(buf)); 4608 if (vp->v_vflag & VV_PROCDEP) 4609 strlcat(buf, "|VV_PROCDEP", sizeof(buf)); 4610 if (vp->v_vflag & VV_DELETED) 4611 strlcat(buf, "|VV_DELETED", sizeof(buf)); 4612 if (vp->v_vflag & VV_MD) 4613 strlcat(buf, "|VV_MD", sizeof(buf)); 4614 if (vp->v_vflag & VV_FORCEINSMQ) 4615 strlcat(buf, "|VV_FORCEINSMQ", sizeof(buf)); 4616 if (vp->v_vflag & VV_READLINK) 4617 strlcat(buf, "|VV_READLINK", sizeof(buf)); 4618 flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_ETERNALDEV | 4619 VV_CACHEDLABEL | VV_VMSIZEVNLOCK | VV_COPYONWRITE | VV_SYSTEM | 4620 VV_PROCDEP | VV_DELETED | VV_MD | VV_FORCEINSMQ | VV_READLINK); 4621 if (flags != 0) { 4622 snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags); 4623 strlcat(buf, buf2, sizeof(buf)); 4624 } 4625 if (vp->v_iflag & VI_MOUNT) 4626 strlcat(buf, "|VI_MOUNT", sizeof(buf)); 4627 if (vp->v_iflag & VI_DOINGINACT) 4628 strlcat(buf, "|VI_DOINGINACT", sizeof(buf)); 4629 if (vp->v_iflag & VI_OWEINACT) 4630 strlcat(buf, "|VI_OWEINACT", sizeof(buf)); 4631 if (vp->v_iflag & VI_DEFINACT) 4632 strlcat(buf, "|VI_DEFINACT", sizeof(buf)); 4633 if (vp->v_iflag & VI_FOPENING) 4634 strlcat(buf, "|VI_FOPENING", sizeof(buf)); 4635 flags = 
vp->v_iflag & ~(VI_MOUNT | VI_DOINGINACT | 4636 VI_OWEINACT | VI_DEFINACT | VI_FOPENING); 4637 if (flags != 0) { 4638 snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags); 4639 strlcat(buf, buf2, sizeof(buf)); 4640 } 4641 if (vp->v_mflag & VMP_LAZYLIST) 4642 strlcat(buf, "|VMP_LAZYLIST", sizeof(buf)); 4643 flags = vp->v_mflag & ~(VMP_LAZYLIST); 4644 if (flags != 0) { 4645 snprintf(buf2, sizeof(buf2), "|VMP(0x%lx)", flags); 4646 strlcat(buf, buf2, sizeof(buf)); 4647 } 4648 printf(" flags (%s)", buf + 1); 4649 if (mtx_owned(VI_MTX(vp))) 4650 printf(" VI_LOCKed"); 4651 printf("\n"); 4652 if (vp->v_object != NULL) 4653 printf(" v_object %p ref %d pages %d " 4654 "cleanbuf %d dirtybuf %d\n", 4655 vp->v_object, vp->v_object->ref_count, 4656 vp->v_object->resident_page_count, 4657 vp->v_bufobj.bo_clean.bv_cnt, 4658 vp->v_bufobj.bo_dirty.bv_cnt); 4659 printf(" "); 4660 lockmgr_printinfo(vp->v_vnlock); 4661 if (vp->v_data != NULL) 4662 VOP_PRINT(vp); 4663 } 4664 4665 #ifdef DDB 4666 /* 4667 * List all of the locked vnodes in the system. 4668 * Called when debugging the kernel. 4669 */ 4670 DB_SHOW_COMMAND_FLAGS(lockedvnods, lockedvnodes, DB_CMD_MEMSAFE) 4671 { 4672 struct mount *mp; 4673 struct vnode *vp; 4674 4675 /* 4676 * Note: because this is DDB, we can't obey the locking semantics 4677 * for these structures, which means we could catch an inconsistent 4678 * state and dereference a nasty pointer. Not much to be done 4679 * about that. 4680 */ 4681 db_printf("Locked vnodes\n"); 4682 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 4683 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 4684 if (vp->v_type != VMARKER && VOP_ISLOCKED(vp)) 4685 vn_printf(vp, "vnode "); 4686 } 4687 } 4688 } 4689 4690 /* 4691 * Show details about the given vnode. 4692 */ 4693 DB_SHOW_COMMAND(vnode, db_show_vnode) 4694 { 4695 struct vnode *vp; 4696 4697 if (!have_addr) 4698 return; 4699 vp = (struct vnode *)addr; 4700 vn_printf(vp, "vnode "); 4701 } 4702 4703 /* 4704 * Show details about the given mount point. 4705 */ 4706 DB_SHOW_COMMAND(mount, db_show_mount) 4707 { 4708 struct mount *mp; 4709 struct vfsopt *opt; 4710 struct statfs *sp; 4711 struct vnode *vp; 4712 char buf[512]; 4713 uint64_t mflags; 4714 u_int flags; 4715 4716 if (!have_addr) { 4717 /* No address given, print short info about all mount points. 
*/ 4718 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 4719 db_printf("%p %s on %s (%s)\n", mp, 4720 mp->mnt_stat.f_mntfromname, 4721 mp->mnt_stat.f_mntonname, 4722 mp->mnt_stat.f_fstypename); 4723 if (db_pager_quit) 4724 break; 4725 } 4726 db_printf("\nMore info: show mount <addr>\n"); 4727 return; 4728 } 4729 4730 mp = (struct mount *)addr; 4731 db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname, 4732 mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename); 4733 4734 buf[0] = '\0'; 4735 mflags = mp->mnt_flag; 4736 #define MNT_FLAG(flag) do { \ 4737 if (mflags & (flag)) { \ 4738 if (buf[0] != '\0') \ 4739 strlcat(buf, ", ", sizeof(buf)); \ 4740 strlcat(buf, (#flag) + 4, sizeof(buf)); \ 4741 mflags &= ~(flag); \ 4742 } \ 4743 } while (0) 4744 MNT_FLAG(MNT_RDONLY); 4745 MNT_FLAG(MNT_SYNCHRONOUS); 4746 MNT_FLAG(MNT_NOEXEC); 4747 MNT_FLAG(MNT_NOSUID); 4748 MNT_FLAG(MNT_NFS4ACLS); 4749 MNT_FLAG(MNT_UNION); 4750 MNT_FLAG(MNT_ASYNC); 4751 MNT_FLAG(MNT_SUIDDIR); 4752 MNT_FLAG(MNT_SOFTDEP); 4753 MNT_FLAG(MNT_NOSYMFOLLOW); 4754 MNT_FLAG(MNT_GJOURNAL); 4755 MNT_FLAG(MNT_MULTILABEL); 4756 MNT_FLAG(MNT_ACLS); 4757 MNT_FLAG(MNT_NOATIME); 4758 MNT_FLAG(MNT_NOCLUSTERR); 4759 MNT_FLAG(MNT_NOCLUSTERW); 4760 MNT_FLAG(MNT_SUJ); 4761 MNT_FLAG(MNT_EXRDONLY); 4762 MNT_FLAG(MNT_EXPORTED); 4763 MNT_FLAG(MNT_DEFEXPORTED); 4764 MNT_FLAG(MNT_EXPORTANON); 4765 MNT_FLAG(MNT_EXKERB); 4766 MNT_FLAG(MNT_EXPUBLIC); 4767 MNT_FLAG(MNT_LOCAL); 4768 MNT_FLAG(MNT_QUOTA); 4769 MNT_FLAG(MNT_ROOTFS); 4770 MNT_FLAG(MNT_USER); 4771 MNT_FLAG(MNT_IGNORE); 4772 MNT_FLAG(MNT_UPDATE); 4773 MNT_FLAG(MNT_DELEXPORT); 4774 MNT_FLAG(MNT_RELOAD); 4775 MNT_FLAG(MNT_FORCE); 4776 MNT_FLAG(MNT_SNAPSHOT); 4777 MNT_FLAG(MNT_BYFSID); 4778 MNT_FLAG(MNT_NAMEDATTR); 4779 #undef MNT_FLAG 4780 if (mflags != 0) { 4781 if (buf[0] != '\0') 4782 strlcat(buf, ", ", sizeof(buf)); 4783 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), 4784 "0x%016jx", mflags); 4785 } 4786 db_printf(" mnt_flag = %s\n", buf); 4787 4788 buf[0] = '\0'; 4789 flags = mp->mnt_kern_flag; 4790 #define MNT_KERN_FLAG(flag) do { \ 4791 if (flags & (flag)) { \ 4792 if (buf[0] != '\0') \ 4793 strlcat(buf, ", ", sizeof(buf)); \ 4794 strlcat(buf, (#flag) + 5, sizeof(buf)); \ 4795 flags &= ~(flag); \ 4796 } \ 4797 } while (0) 4798 MNT_KERN_FLAG(MNTK_UNMOUNTF); 4799 MNT_KERN_FLAG(MNTK_ASYNC); 4800 MNT_KERN_FLAG(MNTK_SOFTDEP); 4801 MNT_KERN_FLAG(MNTK_NOMSYNC); 4802 MNT_KERN_FLAG(MNTK_DRAINING); 4803 MNT_KERN_FLAG(MNTK_REFEXPIRE); 4804 MNT_KERN_FLAG(MNTK_EXTENDED_SHARED); 4805 MNT_KERN_FLAG(MNTK_SHARED_WRITES); 4806 MNT_KERN_FLAG(MNTK_NO_IOPF); 4807 MNT_KERN_FLAG(MNTK_RECURSE); 4808 MNT_KERN_FLAG(MNTK_UPPER_WAITER); 4809 MNT_KERN_FLAG(MNTK_UNLOCKED_INSMNTQUE); 4810 MNT_KERN_FLAG(MNTK_USES_BCACHE); 4811 MNT_KERN_FLAG(MNTK_VMSETSIZE_BUG); 4812 MNT_KERN_FLAG(MNTK_FPLOOKUP); 4813 MNT_KERN_FLAG(MNTK_TASKQUEUE_WAITER); 4814 MNT_KERN_FLAG(MNTK_NOASYNC); 4815 MNT_KERN_FLAG(MNTK_UNMOUNT); 4816 MNT_KERN_FLAG(MNTK_MWAIT); 4817 MNT_KERN_FLAG(MNTK_SUSPEND); 4818 MNT_KERN_FLAG(MNTK_SUSPEND2); 4819 MNT_KERN_FLAG(MNTK_SUSPENDED); 4820 MNT_KERN_FLAG(MNTK_NULL_NOCACHE); 4821 MNT_KERN_FLAG(MNTK_LOOKUP_SHARED); 4822 #undef MNT_KERN_FLAG 4823 if (flags != 0) { 4824 if (buf[0] != '\0') 4825 strlcat(buf, ", ", sizeof(buf)); 4826 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), 4827 "0x%08x", flags); 4828 } 4829 db_printf(" mnt_kern_flag = %s\n", buf); 4830 4831 db_printf(" mnt_opt = "); 4832 opt = TAILQ_FIRST(mp->mnt_opt); 4833 if (opt != NULL) { 4834 db_printf("%s", opt->name); 4835 opt = TAILQ_NEXT(opt, 
link); 4836 while (opt != NULL) { 4837 db_printf(", %s", opt->name); 4838 opt = TAILQ_NEXT(opt, link); 4839 } 4840 } 4841 db_printf("\n"); 4842 4843 sp = &mp->mnt_stat; 4844 db_printf(" mnt_stat = { version=%u type=%u flags=0x%016jx " 4845 "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju " 4846 "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju " 4847 "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n", 4848 (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags, 4849 (uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize, 4850 (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree, 4851 (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files, 4852 (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites, 4853 (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads, 4854 (uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax, 4855 (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]); 4856 4857 db_printf(" mnt_cred = { uid=%u ruid=%u", 4858 (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid); 4859 if (jailed(mp->mnt_cred)) 4860 db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id); 4861 db_printf(" }\n"); 4862 db_printf(" mnt_ref = %d (with %d in the struct)\n", 4863 vfs_mount_fetch_counter(mp, MNT_COUNT_REF), mp->mnt_ref); 4864 db_printf(" mnt_gen = %d\n", mp->mnt_gen); 4865 db_printf(" mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize); 4866 db_printf(" mnt_lazyvnodelistsize = %d\n", 4867 mp->mnt_lazyvnodelistsize); 4868 db_printf(" mnt_writeopcount = %d (with %d in the struct)\n", 4869 vfs_mount_fetch_counter(mp, MNT_COUNT_WRITEOPCOUNT), mp->mnt_writeopcount); 4870 db_printf(" mnt_iosize_max = %d\n", mp->mnt_iosize_max); 4871 db_printf(" mnt_hashseed = %u\n", mp->mnt_hashseed); 4872 db_printf(" mnt_lockref = %d (with %d in the struct)\n", 4873 vfs_mount_fetch_counter(mp, MNT_COUNT_LOCKREF), mp->mnt_lockref); 4874 db_printf(" mnt_secondary_writes = %d\n", mp->mnt_secondary_writes); 4875 db_printf(" mnt_secondary_accwrites = %d\n", 4876 mp->mnt_secondary_accwrites); 4877 db_printf(" mnt_gjprovider = %s\n", 4878 mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL"); 4879 db_printf(" mnt_vfs_ops = %d\n", mp->mnt_vfs_ops); 4880 4881 db_printf("\n\nList of active vnodes\n"); 4882 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 4883 if (vp->v_type != VMARKER && vp->v_holdcnt > 0) { 4884 vn_printf(vp, "vnode "); 4885 if (db_pager_quit) 4886 break; 4887 } 4888 } 4889 db_printf("\n\nList of inactive vnodes\n"); 4890 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 4891 if (vp->v_type != VMARKER && vp->v_holdcnt == 0) { 4892 vn_printf(vp, "vnode "); 4893 if (db_pager_quit) 4894 break; 4895 } 4896 } 4897 } 4898 #endif /* DDB */ 4899 4900 /* 4901 * Fill in a struct xvfsconf based on a struct vfsconf. 4902 */ 4903 static int 4904 vfsconf2x(struct sysctl_req *req, struct vfsconf *vfsp) 4905 { 4906 struct xvfsconf xvfsp; 4907 4908 bzero(&xvfsp, sizeof(xvfsp)); 4909 strcpy(xvfsp.vfc_name, vfsp->vfc_name); 4910 xvfsp.vfc_typenum = vfsp->vfc_typenum; 4911 xvfsp.vfc_refcount = vfsp->vfc_refcount; 4912 xvfsp.vfc_flags = vfsp->vfc_flags; 4913 /* 4914 * These are unused in userland, we keep them 4915 * to not break binary compatibility. 
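 * The structure layout is part of the vfs.conflist sysctl ABI consumed by
 * userland consumers such as getvfsbyname(3).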
4916 */ 4917 xvfsp.vfc_vfsops = NULL; 4918 xvfsp.vfc_next = NULL; 4919 return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); 4920 } 4921 4922 #ifdef COMPAT_FREEBSD32 4923 struct xvfsconf32 { 4924 uint32_t vfc_vfsops; 4925 char vfc_name[MFSNAMELEN]; 4926 int32_t vfc_typenum; 4927 int32_t vfc_refcount; 4928 int32_t vfc_flags; 4929 uint32_t vfc_next; 4930 }; 4931 4932 static int 4933 vfsconf2x32(struct sysctl_req *req, struct vfsconf *vfsp) 4934 { 4935 struct xvfsconf32 xvfsp; 4936 4937 bzero(&xvfsp, sizeof(xvfsp)); 4938 strcpy(xvfsp.vfc_name, vfsp->vfc_name); 4939 xvfsp.vfc_typenum = vfsp->vfc_typenum; 4940 xvfsp.vfc_refcount = vfsp->vfc_refcount; 4941 xvfsp.vfc_flags = vfsp->vfc_flags; 4942 return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); 4943 } 4944 #endif 4945 4946 /* 4947 * Top level filesystem related information gathering. 4948 */ 4949 static int 4950 sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS) 4951 { 4952 struct vfsconf *vfsp; 4953 int error; 4954 4955 error = 0; 4956 vfsconf_slock(); 4957 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { 4958 #ifdef COMPAT_FREEBSD32 4959 if (req->flags & SCTL_MASK32) 4960 error = vfsconf2x32(req, vfsp); 4961 else 4962 #endif 4963 error = vfsconf2x(req, vfsp); 4964 if (error) 4965 break; 4966 } 4967 vfsconf_sunlock(); 4968 return (error); 4969 } 4970 4971 SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD | 4972 CTLFLAG_MPSAFE, NULL, 0, sysctl_vfs_conflist, 4973 "S,xvfsconf", "List of all configured filesystems"); 4974 4975 #ifndef BURN_BRIDGES 4976 static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS); 4977 4978 static int 4979 vfs_sysctl(SYSCTL_HANDLER_ARGS) 4980 { 4981 int *name = (int *)arg1 - 1; /* XXX */ 4982 u_int namelen = arg2 + 1; /* XXX */ 4983 struct vfsconf *vfsp; 4984 4985 log(LOG_WARNING, "userland calling deprecated sysctl, " 4986 "please rebuild world\n"); 4987 4988 #if 1 || defined(COMPAT_PRELITE2) 4989 /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. 
*/ 4990 if (namelen == 1) 4991 return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); 4992 #endif 4993 4994 switch (name[1]) { 4995 case VFS_MAXTYPENUM: 4996 if (namelen != 2) 4997 return (ENOTDIR); 4998 return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); 4999 case VFS_CONF: 5000 if (namelen != 3) 5001 return (ENOTDIR); /* overloaded */ 5002 vfsconf_slock(); 5003 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { 5004 if (vfsp->vfc_typenum == name[2]) 5005 break; 5006 } 5007 vfsconf_sunlock(); 5008 if (vfsp == NULL) 5009 return (EOPNOTSUPP); 5010 #ifdef COMPAT_FREEBSD32 5011 if (req->flags & SCTL_MASK32) 5012 return (vfsconf2x32(req, vfsp)); 5013 else 5014 #endif 5015 return (vfsconf2x(req, vfsp)); 5016 } 5017 return (EOPNOTSUPP); 5018 } 5019 5020 static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP | 5021 CTLFLAG_MPSAFE, vfs_sysctl, 5022 "Generic filesystem"); 5023 5024 #if 1 || defined(COMPAT_PRELITE2) 5025 5026 static int 5027 sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS) 5028 { 5029 int error; 5030 struct vfsconf *vfsp; 5031 struct ovfsconf ovfs; 5032 5033 vfsconf_slock(); 5034 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { 5035 bzero(&ovfs, sizeof(ovfs)); 5036 ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ 5037 strcpy(ovfs.vfc_name, vfsp->vfc_name); 5038 ovfs.vfc_index = vfsp->vfc_typenum; 5039 ovfs.vfc_refcount = vfsp->vfc_refcount; 5040 ovfs.vfc_flags = vfsp->vfc_flags; 5041 error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); 5042 if (error != 0) { 5043 vfsconf_sunlock(); 5044 return (error); 5045 } 5046 } 5047 vfsconf_sunlock(); 5048 return (0); 5049 } 5050 5051 #endif /* 1 || COMPAT_PRELITE2 */ 5052 #endif /* !BURN_BRIDGES */ 5053 5054 static void 5055 unmount_or_warn(struct mount *mp) 5056 { 5057 int error; 5058 5059 error = dounmount(mp, MNT_FORCE, curthread); 5060 if (error != 0) { 5061 printf("unmount of %s failed (", mp->mnt_stat.f_mntonname); 5062 if (error == EBUSY) 5063 printf("BUSY)\n"); 5064 else 5065 printf("%d)\n", error); 5066 } 5067 } 5068 5069 /* 5070 * Unmount all filesystems. The list is traversed in reverse order 5071 * of mounting to avoid dependencies. 5072 */ 5073 void 5074 vfs_unmountall(void) 5075 { 5076 struct mount *mp, *tmp; 5077 5078 CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__); 5079 5080 /* 5081 * Since this only runs when rebooting, it is not interlocked. 5082 */ 5083 TAILQ_FOREACH_REVERSE_SAFE(mp, &mountlist, mntlist, mnt_list, tmp) { 5084 vfs_ref(mp); 5085 5086 /* 5087 * Forcibly unmounting "/dev" before "/" would prevent clean 5088 * unmount of the latter. 
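 * Skip it in this loop; it is unmounted explicitly once everything else
 * has been handled.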
5089 */ 5090 if (mp == rootdevmp) 5091 continue; 5092 5093 unmount_or_warn(mp); 5094 } 5095 5096 if (rootdevmp != NULL) 5097 unmount_or_warn(rootdevmp); 5098 } 5099 5100 static void 5101 vfs_deferred_inactive(struct vnode *vp, int lkflags) 5102 { 5103 5104 ASSERT_VI_LOCKED(vp, __func__); 5105 VNPASS((vp->v_iflag & VI_DEFINACT) == 0, vp); 5106 if ((vp->v_iflag & VI_OWEINACT) == 0) { 5107 vdropl(vp); 5108 return; 5109 } 5110 if (vn_lock(vp, lkflags) == 0) { 5111 VI_LOCK(vp); 5112 vinactive(vp); 5113 VOP_UNLOCK(vp); 5114 vdropl(vp); 5115 return; 5116 } 5117 vdefer_inactive_unlocked(vp); 5118 } 5119 5120 static int 5121 vfs_periodic_inactive_filter(struct vnode *vp, void *arg) 5122 { 5123 5124 return (vp->v_iflag & VI_DEFINACT); 5125 } 5126 5127 static void __noinline 5128 vfs_periodic_inactive(struct mount *mp, int flags) 5129 { 5130 struct vnode *vp, *mvp; 5131 int lkflags; 5132 5133 lkflags = LK_EXCLUSIVE | LK_INTERLOCK; 5134 if (flags != MNT_WAIT) 5135 lkflags |= LK_NOWAIT; 5136 5137 MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, vfs_periodic_inactive_filter, NULL) { 5138 if ((vp->v_iflag & VI_DEFINACT) == 0) { 5139 VI_UNLOCK(vp); 5140 continue; 5141 } 5142 vp->v_iflag &= ~VI_DEFINACT; 5143 vfs_deferred_inactive(vp, lkflags); 5144 } 5145 } 5146 5147 static inline bool 5148 vfs_want_msync(struct vnode *vp) 5149 { 5150 struct vm_object *obj; 5151 5152 /* 5153 * This test may be performed without any locks held. 5154 * We rely on vm_object's type stability. 5155 */ 5156 if (vp->v_vflag & VV_NOSYNC) 5157 return (false); 5158 obj = vp->v_object; 5159 return (obj != NULL && vm_object_mightbedirty(obj)); 5160 } 5161 5162 static int 5163 vfs_periodic_msync_inactive_filter(struct vnode *vp, void *arg __unused) 5164 { 5165 5166 if (vp->v_vflag & VV_NOSYNC) 5167 return (false); 5168 if (vp->v_iflag & VI_DEFINACT) 5169 return (true); 5170 return (vfs_want_msync(vp)); 5171 } 5172 5173 static void __noinline 5174 vfs_periodic_msync_inactive(struct mount *mp, int flags) 5175 { 5176 struct vnode *vp, *mvp; 5177 int lkflags; 5178 bool seen_defer; 5179 5180 lkflags = LK_EXCLUSIVE | LK_INTERLOCK; 5181 if (flags != MNT_WAIT) 5182 lkflags |= LK_NOWAIT; 5183 5184 MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, vfs_periodic_msync_inactive_filter, NULL) { 5185 seen_defer = false; 5186 if (vp->v_iflag & VI_DEFINACT) { 5187 vp->v_iflag &= ~VI_DEFINACT; 5188 seen_defer = true; 5189 } 5190 if (!vfs_want_msync(vp)) { 5191 if (seen_defer) 5192 vfs_deferred_inactive(vp, lkflags); 5193 else 5194 VI_UNLOCK(vp); 5195 continue; 5196 } 5197 if (vget(vp, lkflags) == 0) { 5198 if ((vp->v_vflag & VV_NOSYNC) == 0) { 5199 if (flags == MNT_WAIT) 5200 vnode_pager_clean_sync(vp); 5201 else 5202 vnode_pager_clean_async(vp); 5203 } 5204 vput(vp); 5205 if (seen_defer) 5206 vdrop(vp); 5207 } else { 5208 if (seen_defer) 5209 vdefer_inactive_unlocked(vp); 5210 } 5211 } 5212 } 5213 5214 void 5215 vfs_periodic(struct mount *mp, int flags) 5216 { 5217 5218 CTR2(KTR_VFS, "%s: mp %p", __func__, mp); 5219 5220 if ((mp->mnt_kern_flag & MNTK_NOMSYNC) != 0) 5221 vfs_periodic_inactive(mp, flags); 5222 else 5223 vfs_periodic_msync_inactive(mp, flags); 5224 } 5225 5226 static void 5227 destroy_vpollinfo_free(struct vpollinfo *vi) 5228 { 5229 5230 knlist_destroy(&vi->vpi_selinfo.si_note); 5231 mtx_destroy(&vi->vpi_lock); 5232 free(vi, M_VNODEPOLL); 5233 } 5234 5235 static void 5236 destroy_vpollinfo(struct vpollinfo *vi) 5237 { 5238 5239 knlist_clear(&vi->vpi_selinfo.si_note, 1); 5240 seldrain(&vi->vpi_selinfo); 5241 destroy_vpollinfo_free(vi); 5242 } 5243 5244 /* 5245 * 
Initialize per-vnode helper structure to hold poll-related state. 5246 */ 5247 void 5248 v_addpollinfo(struct vnode *vp) 5249 { 5250 struct vpollinfo *vi; 5251 5252 if (vp->v_pollinfo != NULL) 5253 return; 5254 vi = malloc(sizeof(*vi), M_VNODEPOLL, M_WAITOK | M_ZERO); 5255 mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF); 5256 knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock, 5257 vfs_knlunlock, vfs_knl_assert_lock); 5258 VI_LOCK(vp); 5259 if (vp->v_pollinfo != NULL) { 5260 VI_UNLOCK(vp); 5261 destroy_vpollinfo_free(vi); 5262 return; 5263 } 5264 vp->v_pollinfo = vi; 5265 VI_UNLOCK(vp); 5266 } 5267 5268 /* 5269 * Record a process's interest in events which might happen to 5270 * a vnode. Because poll uses the historic select-style interface 5271 * internally, this routine serves as both the ``check for any 5272 * pending events'' and the ``record my interest in future events'' 5273 * functions. (These are done together, while the lock is held, 5274 * to avoid race conditions.) 5275 */ 5276 int 5277 vn_pollrecord(struct vnode *vp, struct thread *td, int events) 5278 { 5279 5280 v_addpollinfo(vp); 5281 mtx_lock(&vp->v_pollinfo->vpi_lock); 5282 if (vp->v_pollinfo->vpi_revents & events) { 5283 /* 5284 * This leaves events we are not interested 5285 * in available for the other process which 5286 * which presumably had requested them 5287 * (otherwise they would never have been 5288 * recorded). 5289 */ 5290 events &= vp->v_pollinfo->vpi_revents; 5291 vp->v_pollinfo->vpi_revents &= ~events; 5292 5293 mtx_unlock(&vp->v_pollinfo->vpi_lock); 5294 return (events); 5295 } 5296 vp->v_pollinfo->vpi_events |= events; 5297 selrecord(td, &vp->v_pollinfo->vpi_selinfo); 5298 mtx_unlock(&vp->v_pollinfo->vpi_lock); 5299 return (0); 5300 } 5301 5302 /* 5303 * Routine to create and manage a filesystem syncer vnode. 5304 */ 5305 #define sync_close ((int (*)(struct vop_close_args *))nullop) 5306 static int sync_fsync(struct vop_fsync_args *); 5307 static int sync_inactive(struct vop_inactive_args *); 5308 static int sync_reclaim(struct vop_reclaim_args *); 5309 5310 static struct vop_vector sync_vnodeops = { 5311 .vop_bypass = VOP_EOPNOTSUPP, 5312 .vop_close = sync_close, 5313 .vop_fsync = sync_fsync, 5314 .vop_getwritemount = vop_stdgetwritemount, 5315 .vop_inactive = sync_inactive, 5316 .vop_need_inactive = vop_stdneed_inactive, 5317 .vop_reclaim = sync_reclaim, 5318 .vop_lock1 = vop_stdlock, 5319 .vop_unlock = vop_stdunlock, 5320 .vop_islocked = vop_stdislocked, 5321 .vop_fplookup_vexec = VOP_EAGAIN, 5322 .vop_fplookup_symlink = VOP_EAGAIN, 5323 }; 5324 VFS_VOP_VECTOR_REGISTER(sync_vnodeops); 5325 5326 /* 5327 * Create a new filesystem syncer vnode for the specified mount point. 5328 */ 5329 void 5330 vfs_allocate_syncvnode(struct mount *mp) 5331 { 5332 struct vnode *vp; 5333 struct bufobj *bo; 5334 static long start, incr, next; 5335 int error; 5336 5337 /* Allocate a new vnode */ 5338 error = getnewvnode("syncer", mp, &sync_vnodeops, &vp); 5339 if (error != 0) 5340 panic("vfs_allocate_syncvnode: getnewvnode() failed"); 5341 vp->v_type = VNON; 5342 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 5343 vp->v_vflag |= VV_FORCEINSMQ; 5344 error = insmntque1(vp, mp); 5345 if (error != 0) 5346 panic("vfs_allocate_syncvnode: insmntque() failed"); 5347 vp->v_vflag &= ~VV_FORCEINSMQ; 5348 vn_set_state(vp, VSTATE_CONSTRUCTED); 5349 VOP_UNLOCK(vp); 5350 /* 5351 * Place the vnode onto the syncer worklist. 
We attempt to 5352 * scatter them about on the list so that they will go off 5353 * at evenly distributed times even if all the filesystems 5354 * are mounted at once. 5355 */ 5356 next += incr; 5357 if (next == 0 || next > syncer_maxdelay) { 5358 start /= 2; 5359 incr /= 2; 5360 if (start == 0) { 5361 start = syncer_maxdelay / 2; 5362 incr = syncer_maxdelay; 5363 } 5364 next = start; 5365 } 5366 bo = &vp->v_bufobj; 5367 BO_LOCK(bo); 5368 vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0); 5369 /* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */ 5370 mtx_lock(&sync_mtx); 5371 sync_vnode_count++; 5372 if (mp->mnt_syncer == NULL) { 5373 mp->mnt_syncer = vp; 5374 vp = NULL; 5375 } 5376 mtx_unlock(&sync_mtx); 5377 BO_UNLOCK(bo); 5378 if (vp != NULL) { 5379 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 5380 vgone(vp); 5381 vput(vp); 5382 } 5383 } 5384 5385 void 5386 vfs_deallocate_syncvnode(struct mount *mp) 5387 { 5388 struct vnode *vp; 5389 5390 mtx_lock(&sync_mtx); 5391 vp = mp->mnt_syncer; 5392 if (vp != NULL) 5393 mp->mnt_syncer = NULL; 5394 mtx_unlock(&sync_mtx); 5395 if (vp != NULL) 5396 vrele(vp); 5397 } 5398 5399 /* 5400 * Do a lazy sync of the filesystem. 5401 */ 5402 static int 5403 sync_fsync(struct vop_fsync_args *ap) 5404 { 5405 struct vnode *syncvp = ap->a_vp; 5406 struct mount *mp = syncvp->v_mount; 5407 int error, save; 5408 struct bufobj *bo; 5409 5410 /* 5411 * We only need to do something if this is a lazy evaluation. 5412 */ 5413 if (ap->a_waitfor != MNT_LAZY) 5414 return (0); 5415 5416 /* 5417 * Move ourselves to the back of the sync list. 5418 */ 5419 bo = &syncvp->v_bufobj; 5420 BO_LOCK(bo); 5421 vn_syncer_add_to_worklist(bo, syncdelay); 5422 BO_UNLOCK(bo); 5423 5424 /* 5425 * Walk the list of vnodes pushing all that are dirty and 5426 * not already on the sync list. 5427 */ 5428 if (vfs_busy(mp, MBF_NOWAIT) != 0) 5429 return (0); 5430 VOP_UNLOCK(syncvp); 5431 save = curthread_pflags_set(TDP_SYNCIO); 5432 /* 5433 * The filesystem at hand may be idle with free vnodes stored in the 5434 * batch. Return them instead of letting them stay there indefinitely. 5435 */ 5436 vfs_periodic(mp, MNT_NOWAIT); 5437 error = VFS_SYNC(mp, MNT_LAZY); 5438 curthread_pflags_restore(save); 5439 vn_lock(syncvp, LK_EXCLUSIVE | LK_RETRY); 5440 vfs_unbusy(mp); 5441 return (error); 5442 } 5443 5444 /* 5445 * The syncer vnode is no longer referenced. 5446 */ 5447 static int 5448 sync_inactive(struct vop_inactive_args *ap) 5449 { 5450 5451 vgone(ap->a_vp); 5452 return (0); 5453 } 5454 5455 /* 5456 * The syncer vnode is no longer needed and is being decommissioned. 5457 * 5458 * Modifications to the worklist must be protected by sync_mtx.
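 *
 * Reclaim both takes the bufobj off the syncer worklist and clears
 * mnt_syncer, so the syncer thread cannot be left with a stale pointer.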
5459 */ 5460 static int 5461 sync_reclaim(struct vop_reclaim_args *ap) 5462 { 5463 struct vnode *vp = ap->a_vp; 5464 struct bufobj *bo; 5465 5466 bo = &vp->v_bufobj; 5467 BO_LOCK(bo); 5468 mtx_lock(&sync_mtx); 5469 if (vp->v_mount->mnt_syncer == vp) 5470 vp->v_mount->mnt_syncer = NULL; 5471 if (bo->bo_flag & BO_ONWORKLST) { 5472 LIST_REMOVE(bo, bo_synclist); 5473 syncer_worklist_len--; 5474 sync_vnode_count--; 5475 bo->bo_flag &= ~BO_ONWORKLST; 5476 } 5477 mtx_unlock(&sync_mtx); 5478 BO_UNLOCK(bo); 5479 5480 return (0); 5481 } 5482 5483 int 5484 vn_need_pageq_flush(struct vnode *vp) 5485 { 5486 struct vm_object *obj; 5487 5488 obj = vp->v_object; 5489 return (obj != NULL && (vp->v_vflag & VV_NOSYNC) == 0 && 5490 vm_object_mightbedirty(obj)); 5491 } 5492 5493 /* 5494 * Check if vnode represents a disk device 5495 */ 5496 bool 5497 vn_isdisk_error(struct vnode *vp, int *errp) 5498 { 5499 int error; 5500 5501 if (vp->v_type != VCHR) { 5502 error = ENOTBLK; 5503 goto out; 5504 } 5505 error = 0; 5506 dev_lock(); 5507 if (vp->v_rdev == NULL) 5508 error = ENXIO; 5509 else if (vp->v_rdev->si_devsw == NULL) 5510 error = ENXIO; 5511 else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK)) 5512 error = ENOTBLK; 5513 dev_unlock(); 5514 out: 5515 *errp = error; 5516 return (error == 0); 5517 } 5518 5519 bool 5520 vn_isdisk(struct vnode *vp) 5521 { 5522 int error; 5523 5524 return (vn_isdisk_error(vp, &error)); 5525 } 5526 5527 /* 5528 * VOP_FPLOOKUP_VEXEC routines are subject to special circumstances, see 5529 * the comment above cache_fplookup for details. 5530 */ 5531 int 5532 vaccess_vexec_smr(mode_t file_mode, uid_t file_uid, gid_t file_gid, struct ucred *cred) 5533 { 5534 int error; 5535 5536 VFS_SMR_ASSERT_ENTERED(); 5537 5538 /* Check the owner. */ 5539 if (cred->cr_uid == file_uid) { 5540 if (file_mode & S_IXUSR) 5541 return (0); 5542 goto out_error; 5543 } 5544 5545 /* Otherwise, check the groups (first match) */ 5546 if (groupmember(file_gid, cred)) { 5547 if (file_mode & S_IXGRP) 5548 return (0); 5549 goto out_error; 5550 } 5551 5552 /* Otherwise, check everyone else. */ 5553 if (file_mode & S_IXOTH) 5554 return (0); 5555 out_error: 5556 /* 5557 * Permission check failed, but it is possible denial will get overwritten 5558 * (e.g., when root is traversing through a 700 directory owned by someone 5559 * else). 5560 * 5561 * vaccess() calls priv_check_cred which in turn can descent into MAC 5562 * modules overriding this result. It's quite unclear what semantics 5563 * are allowed for them to operate, thus for safety we don't call them 5564 * from within the SMR section. This also means if any such modules 5565 * are present, we have to let the regular lookup decide. 5566 */ 5567 error = priv_check_cred_vfs_lookup_nomac(cred); 5568 switch (error) { 5569 case 0: 5570 return (0); 5571 case EAGAIN: 5572 /* 5573 * MAC modules present. 5574 */ 5575 return (EAGAIN); 5576 case EPERM: 5577 return (EACCES); 5578 default: 5579 return (error); 5580 } 5581 } 5582 5583 /* 5584 * Common filesystem object access control check routine. Accepts a 5585 * vnode's type, "mode", uid and gid, requested access mode, and credentials. 5586 * Returns 0 on success, or an errno on failure. 
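 *
 * A filesystem's VOP_ACCESS() implementation typically funnels into this
 * helper along the following lines (an illustrative sketch; the node fields
 * are placeholders for the filesystem's own inode metadata):
 *
 *	return (vaccess(vp->v_type, node->mode, node->uid, node->gid,
 *	    ap->a_accmode, ap->a_cred));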
5587 */ 5588 int 5589 vaccess(__enum_uint8(vtype) type, mode_t file_mode, uid_t file_uid, gid_t file_gid, 5590 accmode_t accmode, struct ucred *cred) 5591 { 5592 accmode_t dac_granted; 5593 accmode_t priv_granted; 5594 5595 KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0, 5596 ("invalid bit in accmode")); 5597 KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE), 5598 ("VAPPEND without VWRITE")); 5599 5600 /* 5601 * Look for a normal, non-privileged way to access the file/directory 5602 * as requested. If it exists, go with that. 5603 */ 5604 5605 dac_granted = 0; 5606 5607 /* Check the owner. */ 5608 if (cred->cr_uid == file_uid) { 5609 dac_granted |= VADMIN; 5610 if (file_mode & S_IXUSR) 5611 dac_granted |= VEXEC; 5612 if (file_mode & S_IRUSR) 5613 dac_granted |= VREAD; 5614 if (file_mode & S_IWUSR) 5615 dac_granted |= (VWRITE | VAPPEND); 5616 5617 if ((accmode & dac_granted) == accmode) 5618 return (0); 5619 5620 goto privcheck; 5621 } 5622 5623 /* Otherwise, check the groups (first match) */ 5624 if (groupmember(file_gid, cred)) { 5625 if (file_mode & S_IXGRP) 5626 dac_granted |= VEXEC; 5627 if (file_mode & S_IRGRP) 5628 dac_granted |= VREAD; 5629 if (file_mode & S_IWGRP) 5630 dac_granted |= (VWRITE | VAPPEND); 5631 5632 if ((accmode & dac_granted) == accmode) 5633 return (0); 5634 5635 goto privcheck; 5636 } 5637 5638 /* Otherwise, check everyone else. */ 5639 if (file_mode & S_IXOTH) 5640 dac_granted |= VEXEC; 5641 if (file_mode & S_IROTH) 5642 dac_granted |= VREAD; 5643 if (file_mode & S_IWOTH) 5644 dac_granted |= (VWRITE | VAPPEND); 5645 if ((accmode & dac_granted) == accmode) 5646 return (0); 5647 5648 privcheck: 5649 /* 5650 * Build a privilege mask to determine if the set of privileges 5651 * satisfies the requirements when combined with the granted mask 5652 * from above. For each privilege, if the privilege is required, 5653 * bitwise or the request type onto the priv_granted mask. 5654 */ 5655 priv_granted = 0; 5656 5657 if (type == VDIR) { 5658 /* 5659 * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC 5660 * requests, instead of PRIV_VFS_EXEC. 5661 */ 5662 if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && 5663 !priv_check_cred(cred, PRIV_VFS_LOOKUP)) 5664 priv_granted |= VEXEC; 5665 } else { 5666 /* 5667 * Ensure that at least one execute bit is on. Otherwise, 5668 * a privileged user will always succeed, and we don't want 5669 * this to happen unless the file really is executable. 5670 */ 5671 if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && 5672 (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 && 5673 !priv_check_cred(cred, PRIV_VFS_EXEC)) 5674 priv_granted |= VEXEC; 5675 } 5676 5677 if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) && 5678 !priv_check_cred(cred, PRIV_VFS_READ)) 5679 priv_granted |= VREAD; 5680 5681 if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) && 5682 !priv_check_cred(cred, PRIV_VFS_WRITE)) 5683 priv_granted |= (VWRITE | VAPPEND); 5684 5685 if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) && 5686 !priv_check_cred(cred, PRIV_VFS_ADMIN)) 5687 priv_granted |= VADMIN; 5688 5689 if ((accmode & (priv_granted | dac_granted)) == accmode) { 5690 return (0); 5691 } 5692 5693 return ((accmode & VADMIN) ? EPERM : EACCES); 5694 } 5695 5696 /* 5697 * Credential check based on process requesting service, and per-attribute 5698 * permissions. 
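 *
 * In short: EXTATTR_NAMESPACE_SYSTEM requires PRIV_VFS_EXTATTR_SYSTEM,
 * EXTATTR_NAMESPACE_USER falls back to a regular VOP_ACCESS() check, and
 * any other namespace is rejected with EPERM.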
5699 */ 5700 int 5701 extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred, 5702 struct thread *td, accmode_t accmode) 5703 { 5704 5705 /* 5706 * Kernel-invoked always succeeds. 5707 */ 5708 if (cred == NOCRED) 5709 return (0); 5710 5711 /* 5712 * Do not allow privileged processes in jail to directly manipulate 5713 * system attributes. 5714 */ 5715 switch (attrnamespace) { 5716 case EXTATTR_NAMESPACE_SYSTEM: 5717 /* Potentially should be: return (EPERM); */ 5718 return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM)); 5719 case EXTATTR_NAMESPACE_USER: 5720 return (VOP_ACCESS(vp, accmode, cred, td)); 5721 default: 5722 return (EPERM); 5723 } 5724 } 5725 5726 #ifdef DEBUG_VFS_LOCKS 5727 int vfs_badlock_ddb = 1; /* Drop into debugger on violation. */ 5728 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0, 5729 "Drop into debugger on lock violation"); 5730 5731 int vfs_badlock_mutex = 1; /* Check for interlock across VOPs. */ 5732 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex, 5733 0, "Check for interlock across VOPs"); 5734 5735 int vfs_badlock_print = 1; /* Print lock violations. */ 5736 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print, 5737 0, "Print lock violations"); 5738 5739 int vfs_badlock_vnode = 1; /* Print vnode details on lock violations. */ 5740 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_vnode, CTLFLAG_RW, &vfs_badlock_vnode, 5741 0, "Print vnode details on lock violations"); 5742 5743 #ifdef KDB 5744 int vfs_badlock_backtrace = 1; /* Print backtrace at lock violations. */ 5745 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW, 5746 &vfs_badlock_backtrace, 0, "Print backtrace at lock violations"); 5747 #endif 5748 5749 static void 5750 vfs_badlock(const char *msg, const char *str, struct vnode *vp) 5751 { 5752 5753 #ifdef KDB 5754 if (vfs_badlock_backtrace) 5755 kdb_backtrace(); 5756 #endif 5757 if (vfs_badlock_vnode) 5758 vn_printf(vp, "vnode "); 5759 if (vfs_badlock_print) 5760 printf("%s: %p %s\n", str, (void *)vp, msg); 5761 if (vfs_badlock_ddb) 5762 kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); 5763 } 5764 5765 void 5766 assert_vi_locked(struct vnode *vp, const char *str) 5767 { 5768 5769 if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp))) 5770 vfs_badlock("interlock is not locked but should be", str, vp); 5771 } 5772 5773 void 5774 assert_vi_unlocked(struct vnode *vp, const char *str) 5775 { 5776 5777 if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp))) 5778 vfs_badlock("interlock is locked but should not be", str, vp); 5779 } 5780 5781 void 5782 assert_vop_locked(struct vnode *vp, const char *str) 5783 { 5784 if (KERNEL_PANICKED() || vp == NULL) 5785 return; 5786 5787 #ifdef WITNESS 5788 if ((vp->v_irflag & VIRF_CROSSMP) == 0 && 5789 witness_is_owned(&vp->v_vnlock->lock_object) == -1) 5790 #else 5791 int locked = VOP_ISLOCKED(vp); 5792 if (locked == 0 || locked == LK_EXCLOTHER) 5793 #endif 5794 vfs_badlock("is not locked but should be", str, vp); 5795 } 5796 5797 void 5798 assert_vop_unlocked(struct vnode *vp, const char *str) 5799 { 5800 if (KERNEL_PANICKED() || vp == NULL) 5801 return; 5802 5803 #ifdef WITNESS 5804 if ((vp->v_irflag & VIRF_CROSSMP) == 0 && 5805 witness_is_owned(&vp->v_vnlock->lock_object) == 1) 5806 #else 5807 if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE) 5808 #endif 5809 vfs_badlock("is locked but should not be", str, vp); 5810 } 5811 5812 void 5813 assert_vop_elocked(struct vnode *vp, const char *str) 5814 { 5815 if (KERNEL_PANICKED() || vp == 
NULL) 5816 return; 5817 5818 if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) 5819 vfs_badlock("is not exclusive locked but should be", str, vp); 5820 } 5821 #endif /* DEBUG_VFS_LOCKS */ 5822 5823 void 5824 vop_rename_fail(struct vop_rename_args *ap) 5825 { 5826 5827 if (ap->a_tvp != NULL) 5828 vput(ap->a_tvp); 5829 if (ap->a_tdvp == ap->a_tvp) 5830 vrele(ap->a_tdvp); 5831 else 5832 vput(ap->a_tdvp); 5833 vrele(ap->a_fdvp); 5834 vrele(ap->a_fvp); 5835 } 5836 5837 void 5838 vop_rename_pre(void *ap) 5839 { 5840 struct vop_rename_args *a = ap; 5841 5842 #ifdef DEBUG_VFS_LOCKS 5843 if (a->a_tvp) 5844 ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME"); 5845 ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME"); 5846 ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME"); 5847 ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME"); 5848 5849 /* Check the source (from). */ 5850 if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock && 5851 (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock)) 5852 ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked"); 5853 if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock) 5854 ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked"); 5855 5856 /* Check the target. */ 5857 if (a->a_tvp) 5858 ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked"); 5859 ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked"); 5860 #endif 5861 /* 5862 * It may be tempting to add vn_seqc_write_begin/end calls here and 5863 * in vop_rename_post but that's not going to work out since some 5864 * filesystems relookup vnodes mid-rename. This is probably a bug. 5865 * 5866 * For now filesystems are expected to do the relevant calls after they 5867 * decide what vnodes to operate on. 5868 */ 5869 if (a->a_tdvp != a->a_fdvp) 5870 vhold(a->a_fdvp); 5871 if (a->a_tvp != a->a_fvp) 5872 vhold(a->a_fvp); 5873 vhold(a->a_tdvp); 5874 if (a->a_tvp) 5875 vhold(a->a_tvp); 5876 } 5877 5878 #ifdef DEBUG_VFS_LOCKS 5879 void 5880 vop_fplookup_vexec_debugpre(void *ap __unused) 5881 { 5882 5883 VFS_SMR_ASSERT_ENTERED(); 5884 } 5885 5886 void 5887 vop_fplookup_vexec_debugpost(void *ap, int rc) 5888 { 5889 struct vop_fplookup_vexec_args *a; 5890 struct vnode *vp; 5891 5892 a = ap; 5893 vp = a->a_vp; 5894 5895 VFS_SMR_ASSERT_ENTERED(); 5896 if (rc == EOPNOTSUPP) 5897 VNPASS(VN_IS_DOOMED(vp), vp); 5898 } 5899 5900 void 5901 vop_fplookup_symlink_debugpre(void *ap __unused) 5902 { 5903 5904 VFS_SMR_ASSERT_ENTERED(); 5905 } 5906 5907 void 5908 vop_fplookup_symlink_debugpost(void *ap __unused, int rc __unused) 5909 { 5910 5911 VFS_SMR_ASSERT_ENTERED(); 5912 } 5913 5914 static void 5915 vop_fsync_debugprepost(struct vnode *vp, const char *name) 5916 { 5917 if (vp->v_type == VCHR) 5918 ; 5919 /* 5920 * The shared vs. exclusive locking policy for fsync() 5921 * is actually determined by vp's write mount as indicated 5922 * by VOP_GETWRITEMOUNT(), which for stacked filesystems 5923 * may not be the same as vp->v_mount. However, if the 5924 * underlying filesystem which really handles the fsync() 5925 * supports shared locking, the stacked filesystem must also 5926 * be prepared for its VOP_FSYNC() operation to be called 5927 * with only a shared lock. On the other hand, if the 5928 * stacked filesystem claims support for shared write 5929 * locking but the underlying filesystem does not, and the 5930 * caller incorrectly uses a shared lock, this condition 5931 * should still be caught when the stacked filesystem 5932 * invokes VOP_FSYNC() on the underlying filesystem. 
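 *
 * Consequently the assertion below is relaxed to a plain "locked" check
 * when vp->v_mount advertises shared writes and demands an exclusive lock
 * otherwise.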
5933 */ 5934 else if (MNT_SHARED_WRITES(vp->v_mount)) 5935 ASSERT_VOP_LOCKED(vp, name); 5936 else 5937 ASSERT_VOP_ELOCKED(vp, name); 5938 } 5939 5940 void 5941 vop_fsync_debugpre(void *a) 5942 { 5943 struct vop_fsync_args *ap; 5944 5945 ap = a; 5946 vop_fsync_debugprepost(ap->a_vp, "fsync"); 5947 } 5948 5949 void 5950 vop_fsync_debugpost(void *a, int rc __unused) 5951 { 5952 struct vop_fsync_args *ap; 5953 5954 ap = a; 5955 vop_fsync_debugprepost(ap->a_vp, "fsync"); 5956 } 5957 5958 void 5959 vop_fdatasync_debugpre(void *a) 5960 { 5961 struct vop_fdatasync_args *ap; 5962 5963 ap = a; 5964 vop_fsync_debugprepost(ap->a_vp, "fsync"); 5965 } 5966 5967 void 5968 vop_fdatasync_debugpost(void *a, int rc __unused) 5969 { 5970 struct vop_fdatasync_args *ap; 5971 5972 ap = a; 5973 vop_fsync_debugprepost(ap->a_vp, "fsync"); 5974 } 5975 5976 void 5977 vop_strategy_debugpre(void *ap) 5978 { 5979 struct vop_strategy_args *a; 5980 struct buf *bp; 5981 5982 a = ap; 5983 bp = a->a_bp; 5984 5985 /* 5986 * Cluster ops lock their component buffers but not the IO container. 5987 */ 5988 if ((bp->b_flags & B_CLUSTER) != 0) 5989 return; 5990 5991 if (!KERNEL_PANICKED() && !BUF_ISLOCKED(bp)) { 5992 if (vfs_badlock_print) 5993 printf( 5994 "VOP_STRATEGY: bp is not locked but should be\n"); 5995 if (vfs_badlock_ddb) 5996 kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); 5997 } 5998 } 5999 6000 void 6001 vop_lock_debugpre(void *ap) 6002 { 6003 struct vop_lock1_args *a = ap; 6004 6005 if ((a->a_flags & LK_INTERLOCK) == 0) 6006 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); 6007 else 6008 ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK"); 6009 } 6010 6011 void 6012 vop_lock_debugpost(void *ap, int rc) 6013 { 6014 struct vop_lock1_args *a = ap; 6015 6016 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); 6017 if (rc == 0 && (a->a_flags & LK_EXCLOTHER) == 0) 6018 ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK"); 6019 } 6020 6021 void 6022 vop_unlock_debugpre(void *ap) 6023 { 6024 struct vop_unlock_args *a = ap; 6025 struct vnode *vp = a->a_vp; 6026 6027 VNPASS(vn_get_state(vp) != VSTATE_UNINITIALIZED, vp); 6028 ASSERT_VOP_LOCKED(vp, "VOP_UNLOCK"); 6029 } 6030 6031 void 6032 vop_need_inactive_debugpre(void *ap) 6033 { 6034 struct vop_need_inactive_args *a = ap; 6035 6036 ASSERT_VI_LOCKED(a->a_vp, "VOP_NEED_INACTIVE"); 6037 } 6038 6039 void 6040 vop_need_inactive_debugpost(void *ap, int rc) 6041 { 6042 struct vop_need_inactive_args *a = ap; 6043 6044 ASSERT_VI_LOCKED(a->a_vp, "VOP_NEED_INACTIVE"); 6045 } 6046 #endif 6047 6048 void 6049 vop_create_pre(void *ap) 6050 { 6051 struct vop_create_args *a; 6052 struct vnode *dvp; 6053 6054 a = ap; 6055 dvp = a->a_dvp; 6056 vn_seqc_write_begin(dvp); 6057 } 6058 6059 void 6060 vop_create_post(void *ap, int rc) 6061 { 6062 struct vop_create_args *a; 6063 struct vnode *dvp; 6064 6065 a = ap; 6066 dvp = a->a_dvp; 6067 vn_seqc_write_end(dvp); 6068 if (!rc) 6069 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); 6070 } 6071 6072 void 6073 vop_whiteout_pre(void *ap) 6074 { 6075 struct vop_whiteout_args *a; 6076 struct vnode *dvp; 6077 6078 a = ap; 6079 dvp = a->a_dvp; 6080 vn_seqc_write_begin(dvp); 6081 } 6082 6083 void 6084 vop_whiteout_post(void *ap, int rc) 6085 { 6086 struct vop_whiteout_args *a; 6087 struct vnode *dvp; 6088 6089 a = ap; 6090 dvp = a->a_dvp; 6091 vn_seqc_write_end(dvp); 6092 } 6093 6094 void 6095 vop_deleteextattr_pre(void *ap) 6096 { 6097 struct vop_deleteextattr_args *a; 6098 struct vnode *vp; 6099 6100 a = ap; 6101 vp = a->a_vp; 6102 vn_seqc_write_begin(vp); 6103 } 6104 6105 void 6106 vop_deleteextattr_post(void *ap, int 
rc) 6107 { 6108 struct vop_deleteextattr_args *a; 6109 struct vnode *vp; 6110 6111 a = ap; 6112 vp = a->a_vp; 6113 vn_seqc_write_end(vp); 6114 if (!rc) 6115 VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); 6116 } 6117 6118 void 6119 vop_link_pre(void *ap) 6120 { 6121 struct vop_link_args *a; 6122 struct vnode *vp, *tdvp; 6123 6124 a = ap; 6125 vp = a->a_vp; 6126 tdvp = a->a_tdvp; 6127 vn_seqc_write_begin(vp); 6128 vn_seqc_write_begin(tdvp); 6129 } 6130 6131 void 6132 vop_link_post(void *ap, int rc) 6133 { 6134 struct vop_link_args *a; 6135 struct vnode *vp, *tdvp; 6136 6137 a = ap; 6138 vp = a->a_vp; 6139 tdvp = a->a_tdvp; 6140 vn_seqc_write_end(vp); 6141 vn_seqc_write_end(tdvp); 6142 if (!rc) { 6143 VFS_KNOTE_LOCKED(vp, NOTE_LINK); 6144 VFS_KNOTE_LOCKED(tdvp, NOTE_WRITE); 6145 } 6146 } 6147 6148 void 6149 vop_mkdir_pre(void *ap) 6150 { 6151 struct vop_mkdir_args *a; 6152 struct vnode *dvp; 6153 6154 a = ap; 6155 dvp = a->a_dvp; 6156 vn_seqc_write_begin(dvp); 6157 } 6158 6159 void 6160 vop_mkdir_post(void *ap, int rc) 6161 { 6162 struct vop_mkdir_args *a; 6163 struct vnode *dvp; 6164 6165 a = ap; 6166 dvp = a->a_dvp; 6167 vn_seqc_write_end(dvp); 6168 if (!rc) 6169 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK); 6170 } 6171 6172 #ifdef DEBUG_VFS_LOCKS 6173 void 6174 vop_mkdir_debugpost(void *ap, int rc) 6175 { 6176 struct vop_mkdir_args *a; 6177 6178 a = ap; 6179 if (!rc) 6180 cache_validate(a->a_dvp, *a->a_vpp, a->a_cnp); 6181 } 6182 #endif 6183 6184 void 6185 vop_mknod_pre(void *ap) 6186 { 6187 struct vop_mknod_args *a; 6188 struct vnode *dvp; 6189 6190 a = ap; 6191 dvp = a->a_dvp; 6192 vn_seqc_write_begin(dvp); 6193 } 6194 6195 void 6196 vop_mknod_post(void *ap, int rc) 6197 { 6198 struct vop_mknod_args *a; 6199 struct vnode *dvp; 6200 6201 a = ap; 6202 dvp = a->a_dvp; 6203 vn_seqc_write_end(dvp); 6204 if (!rc) 6205 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); 6206 } 6207 6208 void 6209 vop_reclaim_post(void *ap, int rc) 6210 { 6211 struct vop_reclaim_args *a; 6212 struct vnode *vp; 6213 6214 a = ap; 6215 vp = a->a_vp; 6216 ASSERT_VOP_IN_SEQC(vp); 6217 if (!rc) 6218 VFS_KNOTE_LOCKED(vp, NOTE_REVOKE); 6219 } 6220 6221 void 6222 vop_remove_pre(void *ap) 6223 { 6224 struct vop_remove_args *a; 6225 struct vnode *dvp, *vp; 6226 6227 a = ap; 6228 dvp = a->a_dvp; 6229 vp = a->a_vp; 6230 vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK); 6231 vn_seqc_write_begin(dvp); 6232 vn_seqc_write_begin(vp); 6233 } 6234 6235 void 6236 vop_remove_post(void *ap, int rc) 6237 { 6238 struct vop_remove_args *a; 6239 struct vnode *dvp, *vp; 6240 6241 a = ap; 6242 dvp = a->a_dvp; 6243 vp = a->a_vp; 6244 vn_seqc_write_end(dvp); 6245 vn_seqc_write_end(vp); 6246 if (!rc) { 6247 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); 6248 VFS_KNOTE_LOCKED(vp, NOTE_DELETE); 6249 } 6250 } 6251 6252 void 6253 vop_rename_post(void *ap, int rc) 6254 { 6255 struct vop_rename_args *a = ap; 6256 long hint; 6257 6258 if (!rc) { 6259 hint = NOTE_WRITE; 6260 if (a->a_fdvp == a->a_tdvp) { 6261 if (a->a_tvp != NULL && a->a_tvp->v_type == VDIR) 6262 hint |= NOTE_LINK; 6263 VFS_KNOTE_UNLOCKED(a->a_fdvp, hint); 6264 VFS_KNOTE_UNLOCKED(a->a_tdvp, hint); 6265 } else { 6266 hint |= NOTE_EXTEND; 6267 if (a->a_fvp->v_type == VDIR) 6268 hint |= NOTE_LINK; 6269 VFS_KNOTE_UNLOCKED(a->a_fdvp, hint); 6270 6271 if (a->a_fvp->v_type == VDIR && a->a_tvp != NULL && 6272 a->a_tvp->v_type == VDIR) 6273 hint &= ~NOTE_LINK; 6274 VFS_KNOTE_UNLOCKED(a->a_tdvp, hint); 6275 } 6276 6277 VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME); 6278 if (a->a_tvp) 6279 VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE); 6280 
} 6281 if (a->a_tdvp != a->a_fdvp) 6282 vdrop(a->a_fdvp); 6283 if (a->a_tvp != a->a_fvp) 6284 vdrop(a->a_fvp); 6285 vdrop(a->a_tdvp); 6286 if (a->a_tvp) 6287 vdrop(a->a_tvp); 6288 } 6289 6290 void 6291 vop_rmdir_pre(void *ap) 6292 { 6293 struct vop_rmdir_args *a; 6294 struct vnode *dvp, *vp; 6295 6296 a = ap; 6297 dvp = a->a_dvp; 6298 vp = a->a_vp; 6299 vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK); 6300 vn_seqc_write_begin(dvp); 6301 vn_seqc_write_begin(vp); 6302 } 6303 6304 void 6305 vop_rmdir_post(void *ap, int rc) 6306 { 6307 struct vop_rmdir_args *a; 6308 struct vnode *dvp, *vp; 6309 6310 a = ap; 6311 dvp = a->a_dvp; 6312 vp = a->a_vp; 6313 vn_seqc_write_end(dvp); 6314 vn_seqc_write_end(vp); 6315 if (!rc) { 6316 vp->v_vflag |= VV_UNLINKED; 6317 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK); 6318 VFS_KNOTE_LOCKED(vp, NOTE_DELETE); 6319 } 6320 } 6321 6322 void 6323 vop_setattr_pre(void *ap) 6324 { 6325 struct vop_setattr_args *a; 6326 struct vnode *vp; 6327 6328 a = ap; 6329 vp = a->a_vp; 6330 vn_seqc_write_begin(vp); 6331 } 6332 6333 void 6334 vop_setattr_post(void *ap, int rc) 6335 { 6336 struct vop_setattr_args *a; 6337 struct vnode *vp; 6338 6339 a = ap; 6340 vp = a->a_vp; 6341 vn_seqc_write_end(vp); 6342 if (!rc) 6343 VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB); 6344 } 6345 6346 void 6347 vop_setacl_pre(void *ap) 6348 { 6349 struct vop_setacl_args *a; 6350 struct vnode *vp; 6351 6352 a = ap; 6353 vp = a->a_vp; 6354 vn_seqc_write_begin(vp); 6355 } 6356 6357 void 6358 vop_setacl_post(void *ap, int rc __unused) 6359 { 6360 struct vop_setacl_args *a; 6361 struct vnode *vp; 6362 6363 a = ap; 6364 vp = a->a_vp; 6365 vn_seqc_write_end(vp); 6366 } 6367 6368 void 6369 vop_setextattr_pre(void *ap) 6370 { 6371 struct vop_setextattr_args *a; 6372 struct vnode *vp; 6373 6374 a = ap; 6375 vp = a->a_vp; 6376 vn_seqc_write_begin(vp); 6377 } 6378 6379 void 6380 vop_setextattr_post(void *ap, int rc) 6381 { 6382 struct vop_setextattr_args *a; 6383 struct vnode *vp; 6384 6385 a = ap; 6386 vp = a->a_vp; 6387 vn_seqc_write_end(vp); 6388 if (!rc) 6389 VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB); 6390 } 6391 6392 void 6393 vop_symlink_pre(void *ap) 6394 { 6395 struct vop_symlink_args *a; 6396 struct vnode *dvp; 6397 6398 a = ap; 6399 dvp = a->a_dvp; 6400 vn_seqc_write_begin(dvp); 6401 } 6402 6403 void 6404 vop_symlink_post(void *ap, int rc) 6405 { 6406 struct vop_symlink_args *a; 6407 struct vnode *dvp; 6408 6409 a = ap; 6410 dvp = a->a_dvp; 6411 vn_seqc_write_end(dvp); 6412 if (!rc) 6413 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); 6414 } 6415 6416 void 6417 vop_open_post(void *ap, int rc) 6418 { 6419 struct vop_open_args *a = ap; 6420 6421 if (!rc) 6422 VFS_KNOTE_LOCKED(a->a_vp, NOTE_OPEN); 6423 } 6424 6425 void 6426 vop_close_post(void *ap, int rc) 6427 { 6428 struct vop_close_args *a = ap; 6429 6430 if (!rc && (a->a_cred != NOCRED || /* filter out revokes */ 6431 !VN_IS_DOOMED(a->a_vp))) { 6432 VFS_KNOTE_LOCKED(a->a_vp, (a->a_fflag & FWRITE) != 0 ? 
6433 NOTE_CLOSE_WRITE : NOTE_CLOSE); 6434 } 6435 } 6436 6437 void 6438 vop_read_post(void *ap, int rc) 6439 { 6440 struct vop_read_args *a = ap; 6441 6442 if (!rc) 6443 VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ); 6444 } 6445 6446 void 6447 vop_read_pgcache_post(void *ap, int rc) 6448 { 6449 struct vop_read_pgcache_args *a = ap; 6450 6451 if (!rc) 6452 VFS_KNOTE_UNLOCKED(a->a_vp, NOTE_READ); 6453 } 6454 6455 void 6456 vop_readdir_post(void *ap, int rc) 6457 { 6458 struct vop_readdir_args *a = ap; 6459 6460 if (!rc) 6461 VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ); 6462 } 6463 6464 static struct knlist fs_knlist; 6465 6466 static void 6467 vfs_event_init(void *arg) 6468 { 6469 knlist_init_mtx(&fs_knlist, NULL); 6470 } 6471 /* XXX - correct order? */ 6472 SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL); 6473 6474 void 6475 vfs_event_signal(fsid_t *fsid, uint32_t event, intptr_t data __unused) 6476 { 6477 6478 KNOTE_UNLOCKED(&fs_knlist, event); 6479 } 6480 6481 static int filt_fsattach(struct knote *kn); 6482 static void filt_fsdetach(struct knote *kn); 6483 static int filt_fsevent(struct knote *kn, long hint); 6484 6485 const struct filterops fs_filtops = { 6486 .f_isfd = 0, 6487 .f_attach = filt_fsattach, 6488 .f_detach = filt_fsdetach, 6489 .f_event = filt_fsevent, 6490 }; 6491 6492 static int 6493 filt_fsattach(struct knote *kn) 6494 { 6495 6496 kn->kn_flags |= EV_CLEAR; 6497 knlist_add(&fs_knlist, kn, 0); 6498 return (0); 6499 } 6500 6501 static void 6502 filt_fsdetach(struct knote *kn) 6503 { 6504 6505 knlist_remove(&fs_knlist, kn, 0); 6506 } 6507 6508 static int 6509 filt_fsevent(struct knote *kn, long hint) 6510 { 6511 6512 kn->kn_fflags |= kn->kn_sfflags & hint; 6513 6514 return (kn->kn_fflags != 0); 6515 } 6516 6517 static int 6518 sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS) 6519 { 6520 struct vfsidctl vc; 6521 int error; 6522 struct mount *mp; 6523 6524 if (req->newptr == NULL) 6525 return (EINVAL); 6526 error = SYSCTL_IN(req, &vc, sizeof(vc)); 6527 if (error) 6528 return (error); 6529 if (vc.vc_vers != VFS_CTL_VERS1) 6530 return (EINVAL); 6531 mp = vfs_getvfs(&vc.vc_fsid); 6532 if (mp == NULL) 6533 return (ENOENT); 6534 /* ensure that a specific sysctl goes to the right filesystem. */ 6535 if (strcmp(vc.vc_fstypename, "*") != 0 && 6536 strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) { 6537 vfs_rel(mp); 6538 return (EINVAL); 6539 } 6540 VCTLTOREQ(&vc, req); 6541 error = VFS_SYSCTL(mp, vc.vc_op, req); 6542 vfs_rel(mp); 6543 return (error); 6544 } 6545 6546 SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE | CTLFLAG_MPSAFE | CTLFLAG_WR, 6547 NULL, 0, sysctl_vfs_ctl, "", 6548 "Sysctl by fsid"); 6549 6550 /* 6551 * Function to initialize a va_filerev field sensibly. 6552 * XXX: Wouldn't a random number make a lot more sense ?? 
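 *
 * The composition below packs the seconds of uptime into the upper 32
 * bits and the top 32 bits of the binuptime fraction into the lower
 * half, so the value is monotonically non-decreasing across calls
 * within a single boot:
 *
 *	rev = ((u_quad_t)bt.sec << 32) | (bt.frac >> 32);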
6553 */ 6554 u_quad_t 6555 init_va_filerev(void) 6556 { 6557 struct bintime bt; 6558 6559 getbinuptime(&bt); 6560 return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL)); 6561 } 6562 6563 static int filt_vfsread(struct knote *kn, long hint); 6564 static int filt_vfswrite(struct knote *kn, long hint); 6565 static int filt_vfsvnode(struct knote *kn, long hint); 6566 static void filt_vfsdetach(struct knote *kn); 6567 static int filt_vfsdump(struct proc *p, struct knote *kn, 6568 struct kinfo_knote *kin); 6569 6570 static const struct filterops vfsread_filtops = { 6571 .f_isfd = 1, 6572 .f_detach = filt_vfsdetach, 6573 .f_event = filt_vfsread, 6574 .f_userdump = filt_vfsdump, 6575 }; 6576 static const struct filterops vfswrite_filtops = { 6577 .f_isfd = 1, 6578 .f_detach = filt_vfsdetach, 6579 .f_event = filt_vfswrite, 6580 .f_userdump = filt_vfsdump, 6581 }; 6582 static const struct filterops vfsvnode_filtops = { 6583 .f_isfd = 1, 6584 .f_detach = filt_vfsdetach, 6585 .f_event = filt_vfsvnode, 6586 .f_userdump = filt_vfsdump, 6587 }; 6588 6589 static void 6590 vfs_knllock(void *arg) 6591 { 6592 struct vnode *vp = arg; 6593 6594 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 6595 } 6596 6597 static void 6598 vfs_knlunlock(void *arg) 6599 { 6600 struct vnode *vp = arg; 6601 6602 VOP_UNLOCK(vp); 6603 } 6604 6605 static void 6606 vfs_knl_assert_lock(void *arg, int what) 6607 { 6608 #ifdef DEBUG_VFS_LOCKS 6609 struct vnode *vp = arg; 6610 6611 if (what == LA_LOCKED) 6612 ASSERT_VOP_LOCKED(vp, "vfs_knl_assert_locked"); 6613 else 6614 ASSERT_VOP_UNLOCKED(vp, "vfs_knl_assert_unlocked"); 6615 #endif 6616 } 6617 6618 int 6619 vfs_kqfilter(struct vop_kqfilter_args *ap) 6620 { 6621 struct vnode *vp = ap->a_vp; 6622 struct knote *kn = ap->a_kn; 6623 struct knlist *knl; 6624 6625 KASSERT(vp->v_type != VFIFO || (kn->kn_filter != EVFILT_READ && 6626 kn->kn_filter != EVFILT_WRITE), 6627 ("READ/WRITE filter on a FIFO leaked through")); 6628 switch (kn->kn_filter) { 6629 case EVFILT_READ: 6630 kn->kn_fop = &vfsread_filtops; 6631 break; 6632 case EVFILT_WRITE: 6633 kn->kn_fop = &vfswrite_filtops; 6634 break; 6635 case EVFILT_VNODE: 6636 kn->kn_fop = &vfsvnode_filtops; 6637 break; 6638 default: 6639 return (EINVAL); 6640 } 6641 6642 kn->kn_hook = (caddr_t)vp; 6643 6644 v_addpollinfo(vp); 6645 if (vp->v_pollinfo == NULL) 6646 return (ENOMEM); 6647 knl = &vp->v_pollinfo->vpi_selinfo.si_note; 6648 vhold(vp); 6649 knlist_add(knl, kn, 0); 6650 6651 return (0); 6652 } 6653 6654 /* 6655 * Detach knote from vnode 6656 */ 6657 static void 6658 filt_vfsdetach(struct knote *kn) 6659 { 6660 struct vnode *vp = (struct vnode *)kn->kn_hook; 6661 6662 KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo")); 6663 knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0); 6664 vdrop(vp); 6665 } 6666 6667 /*ARGSUSED*/ 6668 static int 6669 filt_vfsread(struct knote *kn, long hint) 6670 { 6671 struct vnode *vp = (struct vnode *)kn->kn_hook; 6672 off_t size; 6673 int res; 6674 6675 /* 6676 * filesystem is gone, so set the EOF flag and schedule 6677 * the knote for deletion. 
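 *
 * From userland this filter is reached by registering EVFILT_READ on a
 * vnode-backed descriptor, e.g. (sketch, error handling omitted):
 *
 *	struct kevent kev;
 *
 *	EV_SET(&kev, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 *
 * After a revoke or a forced unmount dooms the vnode, the event fires
 * with EV_EOF | EV_ONESHOT set by the code below.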
6678 */ 6679 if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) { 6680 VI_LOCK(vp); 6681 kn->kn_flags |= (EV_EOF | EV_ONESHOT); 6682 VI_UNLOCK(vp); 6683 return (1); 6684 } 6685 6686 if (vn_getsize_locked(vp, &size, curthread->td_ucred) != 0) 6687 return (0); 6688 6689 VI_LOCK(vp); 6690 kn->kn_data = size - kn->kn_fp->f_offset; 6691 res = (kn->kn_sfflags & NOTE_FILE_POLL) != 0 || kn->kn_data != 0; 6692 VI_UNLOCK(vp); 6693 return (res); 6694 } 6695 6696 /*ARGSUSED*/ 6697 static int 6698 filt_vfswrite(struct knote *kn, long hint) 6699 { 6700 struct vnode *vp = (struct vnode *)kn->kn_hook; 6701 6702 VI_LOCK(vp); 6703 6704 /* 6705 * filesystem is gone, so set the EOF flag and schedule 6706 * the knote for deletion. 6707 */ 6708 if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) 6709 kn->kn_flags |= (EV_EOF | EV_ONESHOT); 6710 6711 kn->kn_data = 0; 6712 VI_UNLOCK(vp); 6713 return (1); 6714 } 6715 6716 static int 6717 filt_vfsvnode(struct knote *kn, long hint) 6718 { 6719 struct vnode *vp = (struct vnode *)kn->kn_hook; 6720 int res; 6721 6722 VI_LOCK(vp); 6723 if (kn->kn_sfflags & hint) 6724 kn->kn_fflags |= hint; 6725 if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) { 6726 kn->kn_flags |= EV_EOF; 6727 VI_UNLOCK(vp); 6728 return (1); 6729 } 6730 res = (kn->kn_fflags != 0); 6731 VI_UNLOCK(vp); 6732 return (res); 6733 } 6734 6735 static int 6736 filt_vfsdump(struct proc *p, struct knote *kn, struct kinfo_knote *kin) 6737 { 6738 struct vattr va; 6739 struct vnode *vp; 6740 char *fullpath, *freepath; 6741 int error; 6742 6743 kin->knt_extdata = KNOTE_EXTDATA_VNODE; 6744 6745 vp = kn->kn_fp->f_vnode; 6746 kin->knt_vnode.knt_vnode_type = vntype_to_kinfo(vp->v_type); 6747 6748 va.va_fsid = VNOVAL; 6749 vn_lock(vp, LK_SHARED | LK_RETRY); 6750 error = VOP_GETATTR(vp, &va, curthread->td_ucred); 6751 VOP_UNLOCK(vp); 6752 if (error != 0) 6753 return (error); 6754 kin->knt_vnode.knt_vnode_fsid = va.va_fsid; 6755 kin->knt_vnode.knt_vnode_fileid = va.va_fileid; 6756 6757 freepath = NULL; 6758 fullpath = "-"; 6759 error = vn_fullpath(vp, &fullpath, &freepath); 6760 if (error == 0) { 6761 strlcpy(kin->knt_vnode.knt_vnode_fullpath, fullpath, 6762 sizeof(kin->knt_vnode.knt_vnode_fullpath)); 6763 } 6764 if (freepath != NULL) 6765 free(freepath, M_TEMP); 6766 6767 return (0); 6768 } 6769 6770 int 6771 vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off) 6772 { 6773 int error; 6774 6775 if (dp->d_reclen > ap->a_uio->uio_resid) 6776 return (ENAMETOOLONG); 6777 error = uiomove(dp, dp->d_reclen, ap->a_uio); 6778 if (error) { 6779 if (ap->a_ncookies != NULL) { 6780 if (ap->a_cookies != NULL) 6781 free(ap->a_cookies, M_TEMP); 6782 ap->a_cookies = NULL; 6783 *ap->a_ncookies = 0; 6784 } 6785 return (error); 6786 } 6787 if (ap->a_ncookies == NULL) 6788 return (0); 6789 6790 KASSERT(ap->a_cookies, 6791 ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!")); 6792 6793 *ap->a_cookies = realloc(*ap->a_cookies, 6794 (*ap->a_ncookies + 1) * sizeof(uint64_t), M_TEMP, M_WAITOK | M_ZERO); 6795 (*ap->a_cookies)[*ap->a_ncookies] = off; 6796 *ap->a_ncookies += 1; 6797 return (0); 6798 } 6799 6800 /* 6801 * The purpose of this routine is to remove granularity from accmode_t, 6802 * reducing it into standard unix access bits - VEXEC, VREAD, VWRITE, 6803 * VADMIN and VAPPEND. 6804 * 6805 * If it returns 0, the caller is supposed to continue with the usual 6806 * access checks using 'accmode' as modified by this routine. 
If it 6807 * returns nonzero value, the caller is supposed to return that value 6808 * as errno. 6809 * 6810 * Note that after this routine runs, accmode may be zero. 6811 */ 6812 int 6813 vfs_unixify_accmode(accmode_t *accmode) 6814 { 6815 /* 6816 * There is no way to specify explicit "deny" rule using 6817 * file mode or POSIX.1e ACLs. 6818 */ 6819 if (*accmode & VEXPLICIT_DENY) { 6820 *accmode = 0; 6821 return (0); 6822 } 6823 6824 /* 6825 * None of these can be translated into usual access bits. 6826 * Also, the common case for NFSv4 ACLs is to not contain 6827 * either of these bits. Caller should check for VWRITE 6828 * on the containing directory instead. 6829 */ 6830 if (*accmode & (VDELETE_CHILD | VDELETE)) 6831 return (EPERM); 6832 6833 if (*accmode & VADMIN_PERMS) { 6834 *accmode &= ~VADMIN_PERMS; 6835 *accmode |= VADMIN; 6836 } 6837 6838 /* 6839 * There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL 6840 * or VSYNCHRONIZE using file mode or POSIX.1e ACL. 6841 */ 6842 *accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE); 6843 6844 return (0); 6845 } 6846 6847 /* 6848 * Clear out a doomed vnode (if any) and replace it with a new one as long 6849 * as the fs is not being unmounted. Return the root vnode to the caller. 6850 */ 6851 static int __noinline 6852 vfs_cache_root_fallback(struct mount *mp, int flags, struct vnode **vpp) 6853 { 6854 struct vnode *vp; 6855 int error; 6856 6857 restart: 6858 if (mp->mnt_rootvnode != NULL) { 6859 MNT_ILOCK(mp); 6860 vp = mp->mnt_rootvnode; 6861 if (vp != NULL) { 6862 if (!VN_IS_DOOMED(vp)) { 6863 vrefact(vp); 6864 MNT_IUNLOCK(mp); 6865 error = vn_lock(vp, flags); 6866 if (error == 0) { 6867 *vpp = vp; 6868 return (0); 6869 } 6870 vrele(vp); 6871 goto restart; 6872 } 6873 /* 6874 * Clear the old one. 6875 */ 6876 mp->mnt_rootvnode = NULL; 6877 } 6878 MNT_IUNLOCK(mp); 6879 if (vp != NULL) { 6880 vfs_op_barrier_wait(mp); 6881 vrele(vp); 6882 } 6883 } 6884 error = VFS_CACHEDROOT(mp, flags, vpp); 6885 if (error != 0) 6886 return (error); 6887 if (mp->mnt_vfs_ops == 0) { 6888 MNT_ILOCK(mp); 6889 if (mp->mnt_vfs_ops != 0) { 6890 MNT_IUNLOCK(mp); 6891 return (0); 6892 } 6893 if (mp->mnt_rootvnode == NULL) { 6894 vrefact(*vpp); 6895 mp->mnt_rootvnode = *vpp; 6896 } else { 6897 if (mp->mnt_rootvnode != *vpp) { 6898 if (!VN_IS_DOOMED(mp->mnt_rootvnode)) { 6899 panic("%s: mismatch between vnode returned " 6900 " by VFS_CACHEDROOT and the one cached " 6901 " (%p != %p)", 6902 __func__, *vpp, mp->mnt_rootvnode); 6903 } 6904 } 6905 } 6906 MNT_IUNLOCK(mp); 6907 } 6908 return (0); 6909 } 6910 6911 int 6912 vfs_cache_root(struct mount *mp, int flags, struct vnode **vpp) 6913 { 6914 struct mount_pcpu *mpcpu; 6915 struct vnode *vp; 6916 int error; 6917 6918 if (!vfs_op_thread_enter(mp, mpcpu)) 6919 return (vfs_cache_root_fallback(mp, flags, vpp)); 6920 vp = atomic_load_ptr(&mp->mnt_rootvnode); 6921 if (vp == NULL || VN_IS_DOOMED(vp)) { 6922 vfs_op_thread_exit(mp, mpcpu); 6923 return (vfs_cache_root_fallback(mp, flags, vpp)); 6924 } 6925 vrefact(vp); 6926 vfs_op_thread_exit(mp, mpcpu); 6927 error = vn_lock(vp, flags); 6928 if (error != 0) { 6929 vrele(vp); 6930 return (vfs_cache_root_fallback(mp, flags, vpp)); 6931 } 6932 *vpp = vp; 6933 return (0); 6934 } 6935 6936 struct vnode * 6937 vfs_cache_root_clear(struct mount *mp) 6938 { 6939 struct vnode *vp; 6940 6941 /* 6942 * ops > 0 guarantees there is nobody who can see this vnode 6943 */ 6944 MPASS(mp->mnt_vfs_ops > 0); 6945 vp = mp->mnt_rootvnode; 6946 if (vp != NULL) 6947 vn_seqc_write_begin(vp); 6948 mp->mnt_rootvnode = 
NULL; 6949 return (vp); 6950 } 6951 6952 void 6953 vfs_cache_root_set(struct mount *mp, struct vnode *vp) 6954 { 6955 6956 MPASS(mp->mnt_vfs_ops > 0); 6957 vrefact(vp); 6958 mp->mnt_rootvnode = vp; 6959 } 6960 6961 /* 6962 * These are helper functions for filesystems to traverse all 6963 * their vnodes. See MNT_VNODE_FOREACH_ALL() in sys/mount.h. 6964 * 6965 * This interface replaces MNT_VNODE_FOREACH. 6966 */ 6967 6968 struct vnode * 6969 __mnt_vnode_next_all(struct vnode **mvp, struct mount *mp) 6970 { 6971 struct vnode *vp; 6972 6973 maybe_yield(); 6974 MNT_ILOCK(mp); 6975 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); 6976 for (vp = TAILQ_NEXT(*mvp, v_nmntvnodes); vp != NULL; 6977 vp = TAILQ_NEXT(vp, v_nmntvnodes)) { 6978 /* Allow a racy peek at VIRF_DOOMED to save a lock acquisition. */ 6979 if (vp->v_type == VMARKER || VN_IS_DOOMED(vp)) 6980 continue; 6981 VI_LOCK(vp); 6982 if (VN_IS_DOOMED(vp)) { 6983 VI_UNLOCK(vp); 6984 continue; 6985 } 6986 break; 6987 } 6988 if (vp == NULL) { 6989 __mnt_vnode_markerfree_all(mvp, mp); 6990 /* MNT_IUNLOCK(mp); -- done in above function */ 6991 mtx_assert(MNT_MTX(mp), MA_NOTOWNED); 6992 return (NULL); 6993 } 6994 TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); 6995 TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); 6996 MNT_IUNLOCK(mp); 6997 return (vp); 6998 } 6999 7000 struct vnode * 7001 __mnt_vnode_first_all(struct vnode **mvp, struct mount *mp) 7002 { 7003 struct vnode *vp; 7004 7005 *mvp = vn_alloc_marker(mp); 7006 MNT_ILOCK(mp); 7007 MNT_REF(mp); 7008 7009 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 7010 /* Allow a racy peek at VIRF_DOOMED to save a lock acquisition. */ 7011 if (vp->v_type == VMARKER || VN_IS_DOOMED(vp)) 7012 continue; 7013 VI_LOCK(vp); 7014 if (VN_IS_DOOMED(vp)) { 7015 VI_UNLOCK(vp); 7016 continue; 7017 } 7018 break; 7019 } 7020 if (vp == NULL) { 7021 MNT_REL(mp); 7022 MNT_IUNLOCK(mp); 7023 vn_free_marker(*mvp); 7024 *mvp = NULL; 7025 return (NULL); 7026 } 7027 TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); 7028 MNT_IUNLOCK(mp); 7029 return (vp); 7030 } 7031 7032 void 7033 __mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp) 7034 { 7035 7036 if (*mvp == NULL) { 7037 MNT_IUNLOCK(mp); 7038 return; 7039 } 7040 7041 mtx_assert(MNT_MTX(mp), MA_OWNED); 7042 7043 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); 7044 TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); 7045 MNT_REL(mp); 7046 MNT_IUNLOCK(mp); 7047 vn_free_marker(*mvp); 7048 *mvp = NULL; 7049 } 7050 7051 /* 7052 * These are helper functions for filesystems to traverse their 7053 * lazy vnodes. See MNT_VNODE_FOREACH_LAZY() in sys/mount.h 7054 */ 7055 static void 7056 mnt_vnode_markerfree_lazy(struct vnode **mvp, struct mount *mp) 7057 { 7058 7059 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); 7060 7061 MNT_ILOCK(mp); 7062 MNT_REL(mp); 7063 MNT_IUNLOCK(mp); 7064 vn_free_marker(*mvp); 7065 *mvp = NULL; 7066 } 7067 7068 /* 7069 * Relock the mp mount vnode list lock with the vp vnode interlock in the 7070 * conventional lock order during mnt_vnode_next_lazy iteration. 7071 * 7072 * On entry, the mount vnode list lock is held and the vnode interlock is not. 7073 * The list lock is dropped and reacquired. On success, both locks are held. 7074 * On failure, the mount vnode list lock is held but the vnode interlock is 7075 * not, and the procedure may have yielded. 
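 *
 * The expected caller pattern is the one used by mnt_vnode_next_lazy()
 * below (sketch):
 *
 *	if (!VI_TRYLOCK(vp) &&
 *	    !mnt_vnode_next_lazy_relock(*mvp, mp, vp))
 *		goto restart;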
7076 */ 7077 static bool 7078 mnt_vnode_next_lazy_relock(struct vnode *mvp, struct mount *mp, 7079 struct vnode *vp) 7080 { 7081 7082 VNASSERT(mvp->v_mount == mp && mvp->v_type == VMARKER && 7083 TAILQ_NEXT(mvp, v_lazylist) != NULL, mvp, 7084 ("%s: bad marker", __func__)); 7085 VNASSERT(vp->v_mount == mp && vp->v_type != VMARKER, vp, 7086 ("%s: inappropriate vnode", __func__)); 7087 ASSERT_VI_UNLOCKED(vp, __func__); 7088 mtx_assert(&mp->mnt_listmtx, MA_OWNED); 7089 7090 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, mvp, v_lazylist); 7091 TAILQ_INSERT_BEFORE(vp, mvp, v_lazylist); 7092 7093 /* 7094 * Note we may be racing against vdrop which transitioned the hold 7095 * count to 0 and now waits for the ->mnt_listmtx lock. This is fine, 7096 * if we are the only user after we get the interlock we will just 7097 * vdrop. 7098 */ 7099 vhold(vp); 7100 mtx_unlock(&mp->mnt_listmtx); 7101 VI_LOCK(vp); 7102 if (VN_IS_DOOMED(vp)) { 7103 VNPASS((vp->v_mflag & VMP_LAZYLIST) == 0, vp); 7104 goto out_lost; 7105 } 7106 VNPASS(vp->v_mflag & VMP_LAZYLIST, vp); 7107 /* 7108 * There is nothing to do if we are the last user. 7109 */ 7110 if (!refcount_release_if_not_last(&vp->v_holdcnt)) 7111 goto out_lost; 7112 mtx_lock(&mp->mnt_listmtx); 7113 return (true); 7114 out_lost: 7115 vdropl(vp); 7116 maybe_yield(); 7117 mtx_lock(&mp->mnt_listmtx); 7118 return (false); 7119 } 7120 7121 static struct vnode * 7122 mnt_vnode_next_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb, 7123 void *cbarg) 7124 { 7125 struct vnode *vp; 7126 7127 mtx_assert(&mp->mnt_listmtx, MA_OWNED); 7128 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); 7129 restart: 7130 vp = TAILQ_NEXT(*mvp, v_lazylist); 7131 while (vp != NULL) { 7132 if (vp->v_type == VMARKER) { 7133 vp = TAILQ_NEXT(vp, v_lazylist); 7134 continue; 7135 } 7136 /* 7137 * See if we want to process the vnode. Note we may encounter a 7138 * long string of vnodes we don't care about and hog the list 7139 * as a result. Check for it and requeue the marker. 7140 */ 7141 VNPASS(!VN_IS_DOOMED(vp), vp); 7142 if (!cb(vp, cbarg)) { 7143 if (!should_yield()) { 7144 vp = TAILQ_NEXT(vp, v_lazylist); 7145 continue; 7146 } 7147 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, 7148 v_lazylist); 7149 TAILQ_INSERT_AFTER(&mp->mnt_lazyvnodelist, vp, *mvp, 7150 v_lazylist); 7151 mtx_unlock(&mp->mnt_listmtx); 7152 kern_yield(PRI_USER); 7153 mtx_lock(&mp->mnt_listmtx); 7154 goto restart; 7155 } 7156 /* 7157 * Try-lock because this is the wrong lock order. 
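 *
 * The conventional order takes the vnode interlock before mnt_listmtx,
 * but at this point mnt_listmtx is already held, so a blocking
 * VI_LOCK() could deadlock.  When the try-lock fails,
 * mnt_vnode_next_lazy_relock() drops and reacquires the list lock so
 * both locks end up held in the proper order.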
7158 */ 7159 if (!VI_TRYLOCK(vp) && 7160 !mnt_vnode_next_lazy_relock(*mvp, mp, vp)) 7161 goto restart; 7162 KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp)); 7163 KASSERT(vp->v_mount == mp || vp->v_mount == NULL, 7164 ("alien vnode on the lazy list %p %p", vp, mp)); 7165 VNPASS(vp->v_mount == mp, vp); 7166 VNPASS(!VN_IS_DOOMED(vp), vp); 7167 break; 7168 } 7169 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, v_lazylist); 7170 7171 /* Check if we are done */ 7172 if (vp == NULL) { 7173 mtx_unlock(&mp->mnt_listmtx); 7174 mnt_vnode_markerfree_lazy(mvp, mp); 7175 return (NULL); 7176 } 7177 TAILQ_INSERT_AFTER(&mp->mnt_lazyvnodelist, vp, *mvp, v_lazylist); 7178 mtx_unlock(&mp->mnt_listmtx); 7179 ASSERT_VI_LOCKED(vp, "lazy iter"); 7180 return (vp); 7181 } 7182 7183 struct vnode * 7184 __mnt_vnode_next_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb, 7185 void *cbarg) 7186 { 7187 7188 maybe_yield(); 7189 mtx_lock(&mp->mnt_listmtx); 7190 return (mnt_vnode_next_lazy(mvp, mp, cb, cbarg)); 7191 } 7192 7193 struct vnode * 7194 __mnt_vnode_first_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb, 7195 void *cbarg) 7196 { 7197 struct vnode *vp; 7198 7199 if (TAILQ_EMPTY(&mp->mnt_lazyvnodelist)) 7200 return (NULL); 7201 7202 *mvp = vn_alloc_marker(mp); 7203 MNT_ILOCK(mp); 7204 MNT_REF(mp); 7205 MNT_IUNLOCK(mp); 7206 7207 mtx_lock(&mp->mnt_listmtx); 7208 vp = TAILQ_FIRST(&mp->mnt_lazyvnodelist); 7209 if (vp == NULL) { 7210 mtx_unlock(&mp->mnt_listmtx); 7211 mnt_vnode_markerfree_lazy(mvp, mp); 7212 return (NULL); 7213 } 7214 TAILQ_INSERT_BEFORE(vp, *mvp, v_lazylist); 7215 return (mnt_vnode_next_lazy(mvp, mp, cb, cbarg)); 7216 } 7217 7218 void 7219 __mnt_vnode_markerfree_lazy(struct vnode **mvp, struct mount *mp) 7220 { 7221 7222 if (*mvp == NULL) 7223 return; 7224 7225 mtx_lock(&mp->mnt_listmtx); 7226 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, v_lazylist); 7227 mtx_unlock(&mp->mnt_listmtx); 7228 mnt_vnode_markerfree_lazy(mvp, mp); 7229 } 7230 7231 int 7232 vn_dir_check_exec(struct vnode *vp, struct componentname *cnp) 7233 { 7234 7235 if ((cnp->cn_flags & NOEXECCHECK) != 0) { 7236 cnp->cn_flags &= ~NOEXECCHECK; 7237 return (0); 7238 } 7239 7240 return (VOP_ACCESS(vp, VEXEC, cnp->cn_cred, curthread)); 7241 } 7242 7243 /* 7244 * Do not use this variant unless you have means other than the hold count 7245 * to prevent the vnode from getting freed. 7246 */ 7247 void 7248 vn_seqc_write_begin_locked(struct vnode *vp) 7249 { 7250 7251 ASSERT_VI_LOCKED(vp, __func__); 7252 VNPASS(vp->v_holdcnt > 0, vp); 7253 VNPASS(vp->v_seqc_users >= 0, vp); 7254 vp->v_seqc_users++; 7255 if (vp->v_seqc_users == 1) 7256 seqc_sleepable_write_begin(&vp->v_seqc); 7257 } 7258 7259 void 7260 vn_seqc_write_begin(struct vnode *vp) 7261 { 7262 7263 VI_LOCK(vp); 7264 vn_seqc_write_begin_locked(vp); 7265 VI_UNLOCK(vp); 7266 } 7267 7268 void 7269 vn_seqc_write_end_locked(struct vnode *vp) 7270 { 7271 7272 ASSERT_VI_LOCKED(vp, __func__); 7273 VNPASS(vp->v_seqc_users > 0, vp); 7274 vp->v_seqc_users--; 7275 if (vp->v_seqc_users == 0) 7276 seqc_sleepable_write_end(&vp->v_seqc); 7277 } 7278 7279 void 7280 vn_seqc_write_end(struct vnode *vp) 7281 { 7282 7283 VI_LOCK(vp); 7284 vn_seqc_write_end_locked(vp); 7285 VI_UNLOCK(vp); 7286 } 7287 7288 /* 7289 * Special case handling for allocating and freeing vnodes. 7290 * 7291 * The counter remains unchanged on free so that a doomed vnode will 7292 * keep testing as in modify as long as it is accessible with SMR. 
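 *
 * Lockless consumers are expected to snapshot the counter and validate
 * it after the fact, e.g. (a sketch of the read side, assuming the
 * vn_seqc_read_any()/vn_seqc_consistent() helpers):
 *
 *	seqc = vn_seqc_read_any(vp);
 *	if (seqc_in_modify(seqc))
 *		return (EAGAIN);
 *	... speculative work under vfs_smr ...
 *	if (!vn_seqc_consistent(vp, seqc))
 *		return (EAGAIN);
 *
 * Keeping a freed vnode in the modify state makes such readers back off
 * rather than trust state that may be getting recycled.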
7293 */ 7294 static void 7295 vn_seqc_init(struct vnode *vp) 7296 { 7297 7298 vp->v_seqc = 0; 7299 vp->v_seqc_users = 0; 7300 } 7301 7302 static void 7303 vn_seqc_write_end_free(struct vnode *vp) 7304 { 7305 7306 VNPASS(seqc_in_modify(vp->v_seqc), vp); 7307 VNPASS(vp->v_seqc_users == 1, vp); 7308 } 7309 7310 void 7311 vn_irflag_set_locked(struct vnode *vp, short toset) 7312 { 7313 short flags; 7314 7315 ASSERT_VI_LOCKED(vp, __func__); 7316 flags = vn_irflag_read(vp); 7317 VNASSERT((flags & toset) == 0, vp, 7318 ("%s: some of the passed flags already set (have %d, passed %d)\n", 7319 __func__, flags, toset)); 7320 atomic_store_short(&vp->v_irflag, flags | toset); 7321 } 7322 7323 void 7324 vn_irflag_set(struct vnode *vp, short toset) 7325 { 7326 7327 VI_LOCK(vp); 7328 vn_irflag_set_locked(vp, toset); 7329 VI_UNLOCK(vp); 7330 } 7331 7332 void 7333 vn_irflag_set_cond_locked(struct vnode *vp, short toset) 7334 { 7335 short flags; 7336 7337 ASSERT_VI_LOCKED(vp, __func__); 7338 flags = vn_irflag_read(vp); 7339 atomic_store_short(&vp->v_irflag, flags | toset); 7340 } 7341 7342 void 7343 vn_irflag_set_cond(struct vnode *vp, short toset) 7344 { 7345 7346 VI_LOCK(vp); 7347 vn_irflag_set_cond_locked(vp, toset); 7348 VI_UNLOCK(vp); 7349 } 7350 7351 void 7352 vn_irflag_unset_locked(struct vnode *vp, short tounset) 7353 { 7354 short flags; 7355 7356 ASSERT_VI_LOCKED(vp, __func__); 7357 flags = vn_irflag_read(vp); 7358 VNASSERT((flags & tounset) == tounset, vp, 7359 ("%s: some of the passed flags not set (have %d, passed %d)\n", 7360 __func__, flags, tounset)); 7361 atomic_store_short(&vp->v_irflag, flags & ~tounset); 7362 } 7363 7364 void 7365 vn_irflag_unset(struct vnode *vp, short tounset) 7366 { 7367 7368 VI_LOCK(vp); 7369 vn_irflag_unset_locked(vp, tounset); 7370 VI_UNLOCK(vp); 7371 } 7372 7373 int 7374 vn_getsize_locked(struct vnode *vp, off_t *size, struct ucred *cred) 7375 { 7376 struct vattr vattr; 7377 int error; 7378 7379 ASSERT_VOP_LOCKED(vp, __func__); 7380 error = VOP_GETATTR(vp, &vattr, cred); 7381 if (__predict_true(error == 0)) { 7382 if (vattr.va_size <= OFF_MAX) 7383 *size = vattr.va_size; 7384 else 7385 error = EFBIG; 7386 } 7387 return (error); 7388 } 7389 7390 int 7391 vn_getsize(struct vnode *vp, off_t *size, struct ucred *cred) 7392 { 7393 int error; 7394 7395 VOP_LOCK(vp, LK_SHARED); 7396 error = vn_getsize_locked(vp, size, cred); 7397 VOP_UNLOCK(vp); 7398 return (error); 7399 } 7400 7401 #ifdef INVARIANTS 7402 void 7403 vn_set_state_validate(struct vnode *vp, __enum_uint8(vstate) state) 7404 { 7405 7406 switch (vp->v_state) { 7407 case VSTATE_UNINITIALIZED: 7408 switch (state) { 7409 case VSTATE_CONSTRUCTED: 7410 case VSTATE_DESTROYING: 7411 return; 7412 default: 7413 break; 7414 } 7415 break; 7416 case VSTATE_CONSTRUCTED: 7417 ASSERT_VOP_ELOCKED(vp, __func__); 7418 switch (state) { 7419 case VSTATE_DESTROYING: 7420 return; 7421 default: 7422 break; 7423 } 7424 break; 7425 case VSTATE_DESTROYING: 7426 ASSERT_VOP_ELOCKED(vp, __func__); 7427 switch (state) { 7428 case VSTATE_DEAD: 7429 return; 7430 default: 7431 break; 7432 } 7433 break; 7434 case VSTATE_DEAD: 7435 switch (state) { 7436 case VSTATE_UNINITIALIZED: 7437 return; 7438 default: 7439 break; 7440 } 7441 break; 7442 } 7443 7444 vn_printf(vp, "invalid state transition %d -> %d\n", vp->v_state, state); 7445 panic("invalid state transition %d -> %d\n", vp->v_state, state); 7446 } 7447 #endif 7448
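
/*
 * For reference, vn_set_state_validate() accepts the following vnode
 * state transitions, i.e. the life cycle is:
 *
 *	VSTATE_UNINITIALIZED -> VSTATE_CONSTRUCTED
 *	VSTATE_UNINITIALIZED -> VSTATE_DESTROYING
 *	VSTATE_CONSTRUCTED   -> VSTATE_DESTROYING
 *	VSTATE_DESTROYING    -> VSTATE_DEAD
 *	VSTATE_DEAD          -> VSTATE_UNINITIALIZED
 *
 * so a vnode must pass through VSTATE_DESTROYING and VSTATE_DEAD before
 * its memory can be reused for a new, uninitialized vnode.
 */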