/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * External virtual filesystem routines
 */

#include <sys/cdefs.h>
#include "opt_ddb.h"
#include "opt_watchdog.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/asan.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/capsicum.h>
#include <sys/condvar.h>
#include <sys/conf.h>
#include <sys/counter.h>
#include <sys/dirent.h>
#include <sys/event.h>
#include <sys/eventhandler.h>
#include <sys/extattr.h>
#include <sys/file.h>
#include <sys/fcntl.h>
#include <sys/jail.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/ktr.h>
#include <sys/limits.h>
#include <sys/lockf.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/pctrie.h>
#include <sys/priv.h>
#include <sys/reboot.h>
#include <sys/refcount.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/sleepqueue.h>
#include <sys/smr.h>
#include <sys/smp.h>
#include <sys/stat.h>
#include <sys/stdarg.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/user.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>
#include <sys/watchdog.h>

#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_kern.h>
#include <vm/vnode_pager.h>
#include <vm/uma.h>

#if defined(DEBUG_VFS_LOCKS) && (!defined(INVARIANTS) || !defined(WITNESS))
#error DEBUG_VFS_LOCKS requires INVARIANTS and WITNESS
#endif

#ifdef DDB
#include <ddb/ddb.h>
#endif

static void	delmntque(struct vnode *vp);
static int	flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo,
		    int slpflag, int slptimeo);
static void	syncer_shutdown(void *arg, int howto);
static int	vtryrecycle(struct vnode *vp, bool isvnlru);
static void	v_init_counters(struct vnode *);
static void	vn_seqc_init(struct vnode *);
static void	vn_seqc_write_end_free(struct vnode *vp);
static void	vgonel(struct vnode *);
static bool	vhold_recycle_free(struct vnode *);
static void	vdropl_recycle(struct vnode *vp);
static void	vdrop_recycle(struct vnode *vp);
static void	vfs_knllock(void *arg);
static void	vfs_knlunlock(void *arg);
static void	vfs_knl_assert_lock(void *arg, int what);
static void	destroy_vpollinfo(struct vpollinfo *vi);
static int	v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo,
		    daddr_t startlbn, daddr_t endlbn);
static void	vnlru_recalc(void);

static SYSCTL_NODE(_vfs, OID_AUTO, vnode, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "vnode configuration and statistics");
static SYSCTL_NODE(_vfs_vnode, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "vnode configuration");
static SYSCTL_NODE(_vfs_vnode, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "vnode statistics");
static SYSCTL_NODE(_vfs_vnode, OID_AUTO, vnlru, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "vnode recycling");

/*
 * Number of vnodes in existence.  Increased whenever getnewvnode()
 * allocates a new vnode, decreased in vdropl() for VIRF_DOOMED vnode.
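 * Updated with atomic operations; see also the vnode_list_mtx comment
 * further below, which lists numvnodes among the state it covers.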
 */
static u_long __exclusive_cache_line numvnodes;

SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0,
    "Number of vnodes in existence (legacy)");
SYSCTL_ULONG(_vfs_vnode_stats, OID_AUTO, count, CTLFLAG_RD, &numvnodes, 0,
    "Number of vnodes in existence");

static counter_u64_t vnodes_created;
SYSCTL_COUNTER_U64(_vfs, OID_AUTO, vnodes_created, CTLFLAG_RD, &vnodes_created,
    "Number of vnodes created by getnewvnode (legacy)");
SYSCTL_COUNTER_U64(_vfs_vnode_stats, OID_AUTO, created, CTLFLAG_RD, &vnodes_created,
    "Number of vnodes created by getnewvnode");

/*
 * Conversion tables for conversion from vnode types to inode formats
 * and back.
 */
__enum_uint8(vtype) iftovt_tab[16] = {
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
};
int vttoif_tab[10] = {
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
	S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT
};

/*
 * List of allocated vnodes in the system.
 */
static TAILQ_HEAD(freelst, vnode) vnode_list;
static struct vnode *vnode_list_free_marker;
static struct vnode *vnode_list_reclaim_marker;

/*
 * "Free" vnode target.  Free vnodes are rarely completely free, but are
 * just ones that are cheap to recycle.  Usually they are for files which
 * have been stat'd but not read; these usually have inode and namecache
 * data attached to them.  This target is the preferred minimum size of a
 * sub-cache consisting mostly of such files.  The system balances the size
 * of this sub-cache with its complement to try to prevent either from
 * thrashing while the other is relatively inactive.  The targets express
 * a preference for the best balance.
 *
 * "Above" this target there are 2 further targets (watermarks) related
 * to recycling of free vnodes.  In the best-operating case, the cache is
 * exactly full, the free list has size between vlowat and vhiwat above the
 * free target, and recycling from it and normal use maintains this state.
 * Sometimes the free list is below vlowat or even empty, but this state
 * is even better for immediate use provided the cache is not full.
 * Otherwise, vnlru_proc() runs to reclaim enough vnodes (usually non-free
 * ones) to reach one of these states.  The watermarks are currently hard-
 * coded as 4% and 9% of the available space higher.  These and the default
 * of 25% for wantfreevnodes are too large if the memory size is large.
 * E.g., 9% of 75% of MAXVNODES is more than 566000 vnodes to reclaim
 * whenever vnlru_proc() becomes active.
 */
static long wantfreevnodes;
static long __exclusive_cache_line freevnodes;
static long freevnodes_old;

static u_long recycles_count;
SYSCTL_ULONG(_vfs, OID_AUTO, recycles, CTLFLAG_RD | CTLFLAG_STATS, &recycles_count, 0,
    "Number of vnodes recycled to meet vnode cache targets (legacy)");
SYSCTL_ULONG(_vfs_vnode_vnlru, OID_AUTO, recycles, CTLFLAG_RD | CTLFLAG_STATS,
    &recycles_count, 0,
    "Number of vnodes recycled to meet vnode cache targets");

static u_long recycles_free_count;
SYSCTL_ULONG(_vfs, OID_AUTO, recycles_free, CTLFLAG_RD | CTLFLAG_STATS,
    &recycles_free_count, 0,
    "Number of free vnodes recycled to meet vnode cache targets (legacy)");
SYSCTL_ULONG(_vfs_vnode_vnlru, OID_AUTO, recycles_free, CTLFLAG_RD | CTLFLAG_STATS,
    &recycles_free_count, 0,
    "Number of free vnodes recycled to meet vnode cache targets");

static counter_u64_t direct_recycles_free_count;
SYSCTL_COUNTER_U64(_vfs_vnode_vnlru, OID_AUTO, direct_recycles_free, CTLFLAG_RD,
    &direct_recycles_free_count,
    "Number of free vnodes recycled by vn_alloc callers to meet vnode cache targets");

static counter_u64_t vnode_skipped_requeues;
SYSCTL_COUNTER_U64(_vfs_vnode_stats, OID_AUTO, skipped_requeues, CTLFLAG_RD, &vnode_skipped_requeues,
    "Number of times LRU requeue was skipped due to lock contention");

static __read_mostly bool vnode_can_skip_requeue;
SYSCTL_BOOL(_vfs_vnode_param, OID_AUTO, can_skip_requeue, CTLFLAG_RW,
    &vnode_can_skip_requeue, 0, "Is LRU requeue skippable");

static u_long deferred_inact;
SYSCTL_ULONG(_vfs, OID_AUTO, deferred_inact, CTLFLAG_RD,
    &deferred_inact, 0, "Number of times inactive processing was deferred");

/* To keep more than one thread at a time from running vfs_getnewfsid */
static struct mtx mntid_mtx;

/*
 * Lock for any access to the following:
 *	vnode_list
 *	numvnodes
 *	freevnodes
 */
static struct mtx __exclusive_cache_line vnode_list_mtx;

/* Publicly exported FS */
struct nfs_public nfs_pub;

static uma_zone_t buf_trie_zone;
static smr_t buf_trie_smr;

/* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
static uma_zone_t vnode_zone;
MALLOC_DEFINE(M_VNODEPOLL, "VN POLL", "vnode poll");

__read_frequently smr_t vfs_smr;

/*
 * The workitem queue.
 *
 * It is useful to delay writes of file data and filesystem metadata
 * for tens of seconds so that quickly created and deleted files need
 * not waste disk bandwidth being created and removed.  To realize this,
 * we append vnodes to a "workitem" queue.  When running with a soft
 * updates implementation, most pending metadata dependencies should
 * not wait for more than a few seconds.  Thus, mounted block devices
 * are delayed only about half the time that file data is delayed.
 * Similarly, directory updates are more critical, so are only delayed
 * about a third the time that file data is delayed.  Thus, there are
 * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
 * one each second (driven off the filesystem syncer process).  The
 * syncer_delayno variable indicates the next queue that is to be processed.
 * Items that need to be processed soon are placed in this queue:
 *
 *	syncer_workitem_pending[syncer_delayno]
 *
 * A delay of fifteen seconds is done by placing the request fifteen
 * entries later in the queue:
 *
 *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
 *
 */
static int syncer_delayno;
static long syncer_mask;
LIST_HEAD(synclist, bufobj);
static struct synclist *syncer_workitem_pending;
/*
 * The sync_mtx protects:
 *	bo->bo_synclist
 *	sync_vnode_count
 *	syncer_delayno
 *	syncer_state
 *	syncer_workitem_pending
 *	syncer_worklist_len
 *	rushjob
 */
static struct mtx sync_mtx;
static struct cv sync_wakeup;

#define SYNCER_MAXDELAY		32
static int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
static int syncdelay = 30;		/* max time to delay syncing data */
static int filedelay = 30;		/* time to delay syncing files */
SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0,
    "Time to delay syncing files (in seconds)");
static int dirdelay = 29;		/* time to delay syncing directories */
SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0,
    "Time to delay syncing directories (in seconds)");
static int metadelay = 28;		/* time to delay syncing metadata */
SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0,
    "Time to delay syncing metadata (in seconds)");
static int rushjob;		/* number of slots to run ASAP */
static int stat_rush_requests;	/* number of times I/O speeded up */
SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0,
    "Number of times I/O speeded up (rush requests)");

#define	VDBATCH_SIZE 8
struct vdbatch {
	u_int index;
	struct mtx lock;
	struct vnode *tab[VDBATCH_SIZE];
};
DPCPU_DEFINE_STATIC(struct vdbatch, vd);

static void	vdbatch_dequeue(struct vnode *vp);

/*
 * The syncer will require at least SYNCER_MAXDELAY iterations to shutdown;
 * we probably don't want to pause for the whole second each time.
 */
#define SYNCER_SHUTDOWN_SPEEDUP		32
static int sync_vnode_count;
static int syncer_worklist_len;
static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY }
    syncer_state;

/* Target for maximum number of vnodes. */
u_long desiredvnodes;
static u_long gapvnodes;		/* gap between wanted and desired */
static u_long vhiwat;		/* enough extras after expansion */
static u_long vlowat;		/* minimal extras before expansion */
static bool vstir;		/* nonzero to stir non-free vnodes */
static volatile int vsmalltrigger = 8;	/* pref to keep if > this many pages */

static u_long vnlru_read_freevnodes(void);

/*
 * Note that no attempt is made to sanitize these parameters.
 */
static int
sysctl_maxvnodes(SYSCTL_HANDLER_ARGS)
{
	u_long val;
	int error;

	val = desiredvnodes;
	error = sysctl_handle_long(oidp, &val, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);

	if (val == desiredvnodes)
		return (0);
	mtx_lock(&vnode_list_mtx);
	desiredvnodes = val;
	wantfreevnodes = desiredvnodes / 4;
	vnlru_recalc();
	mtx_unlock(&vnode_list_mtx);
	/*
	 * XXX There is no protection against multiple threads changing
	 * desiredvnodes at the same time.  Locking above only helps vnlru and
	 * getnewvnode.
	 */
	vfs_hash_changesize(desiredvnodes);
	cache_changesize(desiredvnodes);
	return (0);
}

SYSCTL_PROC(_kern, KERN_MAXVNODES, maxvnodes,
    CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_maxvnodes,
    "LU", "Target for maximum number of vnodes (legacy)");
SYSCTL_PROC(_vfs_vnode_param, OID_AUTO, limit,
    CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_maxvnodes,
    "LU", "Target for maximum number of vnodes");

static int
sysctl_freevnodes(SYSCTL_HANDLER_ARGS)
{
	u_long rfreevnodes;

	rfreevnodes = vnlru_read_freevnodes();
	return (sysctl_handle_long(oidp, &rfreevnodes, 0, req));
}

SYSCTL_PROC(_vfs, OID_AUTO, freevnodes,
    CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RD, NULL, 0, sysctl_freevnodes,
    "LU", "Number of \"free\" vnodes (legacy)");
SYSCTL_PROC(_vfs_vnode_stats, OID_AUTO, free,
    CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RD, NULL, 0, sysctl_freevnodes,
    "LU", "Number of \"free\" vnodes");

static int
sysctl_wantfreevnodes(SYSCTL_HANDLER_ARGS)
{
	u_long val;
	int error;

	val = wantfreevnodes;
	error = sysctl_handle_long(oidp, &val, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);

	if (val == wantfreevnodes)
		return (0);
	mtx_lock(&vnode_list_mtx);
	wantfreevnodes = val;
	vnlru_recalc();
	mtx_unlock(&vnode_list_mtx);
	return (0);
}

SYSCTL_PROC(_vfs, OID_AUTO, wantfreevnodes,
    CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_wantfreevnodes,
    "LU", "Target for minimum number of \"free\" vnodes (legacy)");
SYSCTL_PROC(_vfs_vnode_param, OID_AUTO, wantfree,
    CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_wantfreevnodes,
    "LU", "Target for minimum number of \"free\" vnodes");

static int vnlru_nowhere;
SYSCTL_INT(_vfs_vnode_vnlru, OID_AUTO, failed_runs, CTLFLAG_RD | CTLFLAG_STATS,
    &vnlru_nowhere, 0, "Number of times the vnlru process ran without success");

static int
sysctl_try_reclaim_vnode(SYSCTL_HANDLER_ARGS)
{
	struct vnode *vp;
	struct nameidata nd;
	char *buf;
	unsigned long ndflags;
	int error;

	if (req->newptr == NULL)
		return (EINVAL);
	if (req->newlen >= PATH_MAX)
		return (E2BIG);

	buf = malloc(PATH_MAX, M_TEMP, M_WAITOK);
	error = SYSCTL_IN(req, buf, req->newlen);
	if (error != 0)
		goto out;

	buf[req->newlen] = '\0';

	ndflags = LOCKLEAF | NOFOLLOW | AUDITVNODE1;
	NDINIT(&nd, LOOKUP, ndflags, UIO_SYSSPACE, buf);
	if ((error = namei(&nd)) != 0)
		goto out;
	vp = nd.ni_vp;

	if (VN_IS_DOOMED(vp)) {
		/*
		 * This vnode is being recycled.  Return != 0 to let the caller
		 * know that the sysctl had no effect.
		 * Return EAGAIN because a subsequent call will likely succeed
		 * (since namei will create a new vnode if necessary)
		 */
		error = EAGAIN;
		goto putvnode;
	}

	vgone(vp);
putvnode:
	vput(vp);
	NDFREE_PNBUF(&nd);
out:
	free(buf, M_TEMP);
	return (error);
}

static int
sysctl_ftry_reclaim_vnode(SYSCTL_HANDLER_ARGS)
{
	struct thread *td = curthread;
	struct vnode *vp;
	struct file *fp;
	int error;
	int fd;

	if (req->newptr == NULL)
		return (EBADF);

	error = sysctl_handle_int(oidp, &fd, 0, req);
	if (error != 0)
		return (error);
	error = getvnode(curthread, fd, &cap_fcntl_rights, &fp);
	if (error != 0)
		return (error);
	vp = fp->f_vnode;

	error = vn_lock(vp, LK_EXCLUSIVE);
	if (error != 0)
		goto drop;

	vgone(vp);
	VOP_UNLOCK(vp);
drop:
	fdrop(fp, td);
	return (error);
}

SYSCTL_PROC(_debug, OID_AUTO, try_reclaim_vnode,
    CTLTYPE_STRING | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0,
    sysctl_try_reclaim_vnode, "A", "Try to reclaim a vnode by its pathname");
SYSCTL_PROC(_debug, OID_AUTO, ftry_reclaim_vnode,
    CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0,
    sysctl_ftry_reclaim_vnode, "I",
    "Try to reclaim a vnode by its file descriptor");

/* Shift count for (uintptr_t)vp to initialize vp->v_hash. */
#define vnsz2log 8
#ifndef DEBUG_LOCKS
_Static_assert(sizeof(struct vnode) >= 1UL << vnsz2log &&
    sizeof(struct vnode) < 1UL << (vnsz2log + 1),
    "vnsz2log needs to be updated");
#endif

/*
 * Support for the bufobj clean & dirty pctrie.
 */
static void *
buf_trie_alloc(struct pctrie *ptree)
{
	return (uma_zalloc_smr(buf_trie_zone, M_NOWAIT));
}

static void
buf_trie_free(struct pctrie *ptree, void *node)
{
	uma_zfree_smr(buf_trie_zone, node);
}
PCTRIE_DEFINE_SMR(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free,
    buf_trie_smr);

/*
 * Lookup the next element greater than or equal to lblkno, accounting for the
 * fact that, for pctries, negative values are greater than nonnegative ones.
 */
static struct buf *
buf_lookup_ge(struct bufv *bv, daddr_t lblkno)
{
	struct buf *bp;

	bp = BUF_PCTRIE_LOOKUP_GE(&bv->bv_root, lblkno);
	if (bp == NULL && lblkno < 0)
		bp = BUF_PCTRIE_LOOKUP_GE(&bv->bv_root, 0);
	if (bp != NULL && bp->b_lblkno < lblkno)
		bp = NULL;
	return (bp);
}

/*
 * Insert bp, and find the next element smaller than bp, accounting for the fact
 * that, for pctries, negative values are greater than nonnegative ones.
 */
static int
buf_insert_lookup_le(struct bufv *bv, struct buf *bp, struct buf **n)
{
	int error;

	error = BUF_PCTRIE_INSERT_LOOKUP_LE(&bv->bv_root, bp, n);
	if (error != EEXIST) {
		if (*n == NULL && bp->b_lblkno >= 0)
			*n = BUF_PCTRIE_LOOKUP_LE(&bv->bv_root, ~0L);
		if (*n != NULL && (*n)->b_lblkno >= bp->b_lblkno)
			*n = NULL;
	}
	return (error);
}

/*
 * Initialize the vnode management data structures.
 *
 * Reevaluate the following cap on the number of vnodes after the physical
 * memory size exceeds 512GB.  In the limit, as the physical memory size
 * grows, the ratio of the memory size in KB to vnodes approaches 64:1.
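 * As a worked example of that ratio, the MAXVNODES_MAX value defined just
 * below is 512 * 1024 * 1024 / 64, i.e. roughly 8 million vnodes for a
 * 512GB machine.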
 */
#ifndef MAXVNODES_MAX
#define MAXVNODES_MAX	(512UL * 1024 * 1024 / 64)	/* 8M */
#endif

static MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker");

static struct vnode *
vn_alloc_marker(struct mount *mp)
{
	struct vnode *vp;

	vp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO);
	vp->v_type = VMARKER;
	vp->v_mount = mp;

	return (vp);
}

static void
vn_free_marker(struct vnode *vp)
{

	MPASS(vp->v_type == VMARKER);
	free(vp, M_VNODE_MARKER);
}

#ifdef KASAN
static int
vnode_ctor(void *mem, int size, void *arg __unused, int flags __unused)
{
	kasan_mark(mem, size, roundup2(size, UMA_ALIGN_PTR + 1), 0);
	return (0);
}

static void
vnode_dtor(void *mem, int size, void *arg __unused)
{
	size_t end1, end2, off1, off2;

	_Static_assert(offsetof(struct vnode, v_vnodelist) <
	    offsetof(struct vnode, v_dbatchcpu),
	    "KASAN marks require updating");

	off1 = offsetof(struct vnode, v_vnodelist);
	off2 = offsetof(struct vnode, v_dbatchcpu);
	end1 = off1 + sizeof(((struct vnode *)NULL)->v_vnodelist);
	end2 = off2 + sizeof(((struct vnode *)NULL)->v_dbatchcpu);

	/*
	 * Accesses to the v_vnodelist and v_dbatchcpu fields are permitted
	 * even after the vnode has been freed.  Try to get some KASAN
	 * coverage by marking everything except those two fields as invalid.
	 * Because KASAN's tracking is not byte-granular, any preceding fields
	 * sharing the same 8-byte aligned word must also be marked valid.
	 */

	/* Handle the area from the start until v_vnodelist... */
	off1 = rounddown2(off1, KASAN_SHADOW_SCALE);
	kasan_mark(mem, off1, off1, KASAN_UMA_FREED);

	/* ... then the area between v_vnodelist and v_dbatchcpu ... */
	off1 = roundup2(end1, KASAN_SHADOW_SCALE);
	off2 = rounddown2(off2, KASAN_SHADOW_SCALE);
	if (off2 > off1)
		kasan_mark((void *)((char *)mem + off1), off2 - off1,
		    off2 - off1, KASAN_UMA_FREED);

	/* ... and finally the area from v_dbatchcpu to the end. */
	off2 = roundup2(end2, KASAN_SHADOW_SCALE);
	kasan_mark((void *)((char *)mem + off2), size - off2, size - off2,
	    KASAN_UMA_FREED);
}
#endif /* KASAN */

/*
 * Initialize a vnode as it first enters the zone.
 */
static int
vnode_init(void *mem, int size, int flags)
{
	struct vnode *vp;

	vp = mem;
	bzero(vp, size);
	/*
	 * Setup locks.
	 */
	vp->v_vnlock = &vp->v_lock;
	mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
	/*
	 * By default, don't allow shared locks unless filesystems opt-in.
	 */
	lockinit(vp->v_vnlock, PVFS, "vnode", VLKTIMEOUT,
	    LK_NOSHARE | LK_IS_VNODE);
	/*
	 * Initialize bufobj.
	 */
	bufobj_init(&vp->v_bufobj, vp);
	/*
	 * Initialize namecache.
	 */
	cache_vnode_init(vp);
	/*
	 * Initialize rangelocks.
	 */
	rangelock_init(&vp->v_rl);

	vp->v_dbatchcpu = NOCPU;

	vp->v_state = VSTATE_DEAD;

	/*
	 * Check vhold_recycle_free for an explanation.
	 */
	vp->v_holdcnt = VHOLD_NO_SMR;
	vp->v_type = VNON;
	mtx_lock(&vnode_list_mtx);
	TAILQ_INSERT_BEFORE(vnode_list_free_marker, vp, v_vnodelist);
	mtx_unlock(&vnode_list_mtx);
	return (0);
}

/*
 * Free a vnode when it is cleared from the zone.
 */
static void
vnode_fini(void *mem, int size)
{
	struct vnode *vp;
	struct bufobj *bo;

	vp = mem;
	vdbatch_dequeue(vp);
	mtx_lock(&vnode_list_mtx);
	TAILQ_REMOVE(&vnode_list, vp, v_vnodelist);
	mtx_unlock(&vnode_list_mtx);
	rangelock_destroy(&vp->v_rl);
	lockdestroy(vp->v_vnlock);
	mtx_destroy(&vp->v_interlock);
	bo = &vp->v_bufobj;
	rw_destroy(BO_LOCKPTR(bo));

	kasan_mark(mem, size, size, 0);
}

/*
 * Provide the size of NFS nclnode and NFS fh for calculation of the
 * vnode memory consumption.  The size is specified directly to
 * eliminate dependency on NFS-private header.
 *
 * Other filesystems (like UFS and ZFS) may use bigger or smaller
 * private inode data, but the NFS-based estimation is ample enough.
 * Still, we care about differences in the size between 64- and 32-bit
 * platforms.
 *
 * Namecache structure size is heuristically
 * sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1.
 */
#ifdef _LP64
#define NFS_NCLNODE_SZ	(528 + 64)
#define	NC_SZ	148
#else
#define NFS_NCLNODE_SZ	(360 + 32)
#define	NC_SZ	92
#endif

static void
vntblinit(void *dummy __unused)
{
	struct vdbatch *vd;
	uma_ctor ctor;
	uma_dtor dtor;
	int cpu, physvnodes, virtvnodes;

	/*
	 * 'desiredvnodes' is the minimum of a function of the physical memory
	 * size and another of the kernel heap size (UMA limit, a portion of
	 * the KVA).
	 *
	 * Currently, on 64-bit platforms, 'desiredvnodes' is set to
	 * 'virtvnodes' up to a physical memory cutoff of ~1722MB, after which
	 * 'physvnodes' applies instead.  With the current automatic tuning for
	 * 'maxfiles' (32 files/MB), 'desiredvnodes' is always greater than it.
	 */
	physvnodes = maxproc + pgtok(vm_cnt.v_page_count) / 32 +
	    min(98304 * 16, pgtok(vm_cnt.v_page_count)) / 32;
	virtvnodes = vm_kmem_size / (10 * (sizeof(struct vm_object) +
	    sizeof(struct vnode) + NC_SZ * ncsizefactor + NFS_NCLNODE_SZ));
	desiredvnodes = min(physvnodes, virtvnodes);
	if (desiredvnodes > MAXVNODES_MAX) {
		if (bootverbose)
			printf("Reducing kern.maxvnodes %lu -> %lu\n",
			    desiredvnodes, MAXVNODES_MAX);
		desiredvnodes = MAXVNODES_MAX;
	}
	wantfreevnodes = desiredvnodes / 4;
	mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
	TAILQ_INIT(&vnode_list);
	mtx_init(&vnode_list_mtx, "vnode_list", NULL, MTX_DEF);
	/*
	 * The lock is taken to appease WITNESS.
	 */
	mtx_lock(&vnode_list_mtx);
	vnlru_recalc();
	mtx_unlock(&vnode_list_mtx);
	vnode_list_free_marker = vn_alloc_marker(NULL);
	TAILQ_INSERT_HEAD(&vnode_list, vnode_list_free_marker, v_vnodelist);
	vnode_list_reclaim_marker = vn_alloc_marker(NULL);
	TAILQ_INSERT_HEAD(&vnode_list, vnode_list_reclaim_marker, v_vnodelist);

#ifdef KASAN
	ctor = vnode_ctor;
	dtor = vnode_dtor;
#else
	ctor = NULL;
	dtor = NULL;
#endif
	vnode_zone = uma_zcreate("VNODE", sizeof(struct vnode), ctor, dtor,
	    vnode_init, vnode_fini, UMA_ALIGN_PTR, UMA_ZONE_NOKASAN);
	uma_zone_set_smr(vnode_zone, vfs_smr);

	/*
	 * Preallocate enough nodes to support one-per buf so that
	 * we can not fail an insert.  reassignbuf() callers can not
	 * tolerate the insertion failure.
	 */
	buf_trie_zone = uma_zcreate("BUF TRIE", pctrie_node_size(),
	    NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR,
	    UMA_ZONE_NOFREE | UMA_ZONE_SMR);
	buf_trie_smr = uma_zone_get_smr(buf_trie_zone);
	uma_prealloc(buf_trie_zone, nbuf);

	vnodes_created = counter_u64_alloc(M_WAITOK);
	direct_recycles_free_count = counter_u64_alloc(M_WAITOK);
	vnode_skipped_requeues = counter_u64_alloc(M_WAITOK);

	/*
	 * Initialize the filesystem syncer.
	 */
	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
	    &syncer_mask);
	syncer_maxdelay = syncer_mask + 1;
	mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
	cv_init(&sync_wakeup, "syncer");

	CPU_FOREACH(cpu) {
		vd = DPCPU_ID_PTR((cpu), vd);
		bzero(vd, sizeof(*vd));
		mtx_init(&vd->lock, "vdbatch", NULL, MTX_DEF);
	}
}
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL);

/*
 * Mark a mount point as busy.  Used to synchronize access and to delay
 * unmounting.  Eventually, mountlist_mtx is not released on failure.
 *
 * vfs_busy() is a custom lock, it can block the caller.
 * vfs_busy() only sleeps if the unmount is active on the mount point.
 * For a mountpoint mp, vfs_busy-enforced lock is before lock of any
 * vnode belonging to mp.
 *
 * Lookup uses vfs_busy() to traverse mount points.
 * root fs				var fs
 * / vnode lock		A	/ vnode lock (/var)		D
 * /var vnode lock	B	/log vnode lock(/var/log)	E
 * vfs_busy lock	C	vfs_busy lock			F
 *
 * Within each file system, the lock order is C->A->B and F->D->E.
 *
 * When traversing across mounts, the system follows that lock order:
 *
 *	C->A->B
 *	      |
 *	      +->F->D->E
 *
 * The lookup() process for namei("/var") illustrates the process:
 *  1. VOP_LOOKUP() obtains B while A is held
 *  2. vfs_busy() obtains a shared lock on F while A and B are held
 *  3. vput() releases lock on B
 *  4. vput() releases lock on A
 *  5. VFS_ROOT() obtains lock on D while shared lock on F is held
 *  6. vfs_unbusy() releases shared lock on F
 *  7. vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A.
 *     Attempt to lock A (instead of vp_crossmp) while D is held would
 *     violate the global order, causing deadlocks.
 *
 * dounmount() locks B while F is drained.  Note that for stacked
 * filesystems, D and B in the example above may be the same lock,
 * which introduces potential lock order reversal deadlock between
 * dounmount() and step 5 above.  These filesystems may avoid the LOR
 * by setting VV_CROSSLOCK on the covered vnode so that lock B will
 * remain held until after step 5.
 */
int
vfs_busy(struct mount *mp, int flags)
{
	struct mount_pcpu *mpcpu;

	MPASS((flags & ~MBF_MASK) == 0);
	CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags);

	if (vfs_op_thread_enter(mp, mpcpu)) {
		MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0);
		MPASS((mp->mnt_kern_flag & MNTK_UNMOUNT) == 0);
		MPASS((mp->mnt_kern_flag & MNTK_REFEXPIRE) == 0);
		vfs_mp_count_add_pcpu(mpcpu, ref, 1);
		vfs_mp_count_add_pcpu(mpcpu, lockref, 1);
		vfs_op_thread_exit(mp, mpcpu);
		if (flags & MBF_MNTLSTLOCK)
			mtx_unlock(&mountlist_mtx);
		return (0);
	}

	MNT_ILOCK(mp);
	vfs_assert_mount_counters(mp);
	MNT_REF(mp);
	/*
	 * If mount point is currently being unmounted, sleep until the
	 * mount point fate is decided.
	 * If thread doing the unmounting fails, it will clear MNTK_UNMOUNT
	 * flag before waking us up, indicating that this mount point has
	 * survived the unmount attempt and vfs_busy should retry.  Otherwise
	 * the unmounter thread will set MNTK_REFEXPIRE flag in addition to
	 * MNTK_UNMOUNT, indicating that mount point is about to be really
	 * destroyed.  vfs_busy needs to release its reference on the mount
	 * point in this case and return with ENOENT, telling the caller the
	 * mount it tried to busy is no longer valid.
	 */
	while (mp->mnt_kern_flag & MNTK_UNMOUNT) {
		KASSERT(TAILQ_EMPTY(&mp->mnt_uppers),
		    ("%s: non-empty upper mount list with pending unmount",
		    __func__));
		if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) {
			MNT_REL(mp);
			MNT_IUNLOCK(mp);
			CTR1(KTR_VFS, "%s: failed busying before sleeping",
			    __func__);
			return (ENOENT);
		}
		if (flags & MBF_MNTLSTLOCK)
			mtx_unlock(&mountlist_mtx);
		mp->mnt_kern_flag |= MNTK_MWAIT;
		msleep(mp, MNT_MTX(mp), PVFS | PDROP, "vfs_busy", 0);
		if (flags & MBF_MNTLSTLOCK)
			mtx_lock(&mountlist_mtx);
		MNT_ILOCK(mp);
	}
	if (flags & MBF_MNTLSTLOCK)
		mtx_unlock(&mountlist_mtx);
	mp->mnt_lockref++;
	MNT_IUNLOCK(mp);
	return (0);
}

/*
 * Free a busy filesystem.
 */
void
vfs_unbusy(struct mount *mp)
{
	struct mount_pcpu *mpcpu;
	int c;

	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);

	if (vfs_op_thread_enter(mp, mpcpu)) {
		MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0);
		vfs_mp_count_sub_pcpu(mpcpu, lockref, 1);
		vfs_mp_count_sub_pcpu(mpcpu, ref, 1);
		vfs_op_thread_exit(mp, mpcpu);
		return;
	}

	MNT_ILOCK(mp);
	vfs_assert_mount_counters(mp);
	MNT_REL(mp);
	c = --mp->mnt_lockref;
	if (mp->mnt_vfs_ops == 0) {
		MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0);
		MNT_IUNLOCK(mp);
		return;
	}
	if (c < 0)
		vfs_dump_mount_counters(mp);
	if (c == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) {
		MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT);
		CTR1(KTR_VFS, "%s: waking up waiters", __func__);
		mp->mnt_kern_flag &= ~MNTK_DRAINING;
		wakeup(&mp->mnt_lockref);
	}
	MNT_IUNLOCK(mp);
}

/*
 * Lookup a mount point by filesystem identifier.
 */
struct mount *
vfs_getvfs(fsid_t *fsid)
{
	struct mount *mp;

	CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
	mtx_lock(&mountlist_mtx);
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) {
			vfs_ref(mp);
			mtx_unlock(&mountlist_mtx);
			return (mp);
		}
	}
	mtx_unlock(&mountlist_mtx);
	CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
	return ((struct mount *) 0);
}

/*
 * Lookup a mount point by filesystem identifier, busying it before
 * returning.
 *
 * To avoid congestion on mountlist_mtx, implement simple direct-mapped
 * cache for popular filesystem identifiers.  The cache is lockless, using
 * the fact that struct mount's are never freed.  In worst case we may
 * get pointer to unmounted or even different filesystem, so we have to
 * check what we got, and go slow way if so.
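 * The cache index is derived by hashing both fsid words together and
 * folding the result into FSID_CACHE_SIZE buckets; see the hash
 * computation at the top of vfs_busyfs() below.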
 */
struct mount *
vfs_busyfs(fsid_t *fsid)
{
#define	FSID_CACHE_SIZE	256
	typedef struct mount * volatile vmp_t;
	static vmp_t cache[FSID_CACHE_SIZE];
	struct mount *mp;
	int error;
	uint32_t hash;

	CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
	hash = fsid->val[0] ^ fsid->val[1];
	hash = (hash >> 16 ^ hash) & (FSID_CACHE_SIZE - 1);
	mp = cache[hash];
	if (mp == NULL || fsidcmp(&mp->mnt_stat.f_fsid, fsid) != 0)
		goto slow;
	if (vfs_busy(mp, 0) != 0) {
		cache[hash] = NULL;
		goto slow;
	}
	if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0)
		return (mp);
	else
		vfs_unbusy(mp);

slow:
	mtx_lock(&mountlist_mtx);
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) {
			error = vfs_busy(mp, MBF_MNTLSTLOCK);
			if (error) {
				cache[hash] = NULL;
				mtx_unlock(&mountlist_mtx);
				return (NULL);
			}
			cache[hash] = mp;
			return (mp);
		}
	}
	CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
	mtx_unlock(&mountlist_mtx);
	return ((struct mount *) 0);
}

/*
 * Check if a user can access privileged mount options.
 */
int
vfs_suser(struct mount *mp, struct thread *td)
{
	int error;

	if (jailed(td->td_ucred)) {
		/*
		 * If the jail of the calling thread lacks permission for
		 * this type of file system, deny immediately.
		 */
		if (!prison_allow(td->td_ucred, mp->mnt_vfc->vfc_prison_flag))
			return (EPERM);

		/*
		 * If the file system was mounted outside the jail of the
		 * calling thread, deny immediately.
		 */
		if (prison_check(td->td_ucred, mp->mnt_cred) != 0)
			return (EPERM);
	}

	/*
	 * If file system supports delegated administration, we don't check
	 * for the PRIV_VFS_MOUNT_OWNER privilege - it will be better verified
	 * by the file system itself.
	 * If this is not the user that did original mount, we check for
	 * the PRIV_VFS_MOUNT_OWNER privilege.
	 */
	if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) &&
	    mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) {
		if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0)
			return (error);
	}
	return (0);
}

/*
 * Get a new unique fsid.  Try to make its val[0] unique, since this value
 * will be used to create fake device numbers for stat().  Also try (but
 * not so hard) to make its val[0] unique mod 2^16, since some emulators only
 * support 16-bit device numbers.  We end up with unique val[0]'s for the
 * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
 *
 * Keep in mind that several mounts may be running in parallel.  Starting
 * the search one past where the previous search terminated is both a
 * micro-optimization and a defense against returning the same fsid to
 * different mounts.
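 * In the loop below, val[0] is built with makedev(): the filesystem type
 * number occupies the top byte of the minor value and the two bytes of
 * mntid_base are spread across the remaining bits, which is what gives
 * the uniqueness properties described above.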
 */
void
vfs_getnewfsid(struct mount *mp)
{
	static uint16_t mntid_base;
	struct mount *nmp;
	fsid_t tfsid;
	int mtype;

	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
	mtx_lock(&mntid_mtx);
	mtype = mp->mnt_vfc->vfc_typenum;
	tfsid.val[1] = mtype;
	mtype = (mtype & 0xFF) << 24;
	for (;;) {
		tfsid.val[0] = makedev(255,
		    mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
		mntid_base++;
		if ((nmp = vfs_getvfs(&tfsid)) == NULL)
			break;
		vfs_rel(nmp);
	}
	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
	mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
	mtx_unlock(&mntid_mtx);
}

/*
 * Knob to control the precision of file timestamps:
 *
 * 0 = seconds only; nanoseconds zeroed.
 * 1 = seconds and nanoseconds, accurate within 1/HZ.
 * 2 = seconds and nanoseconds, truncated to microseconds.
 * >=3 = seconds and nanoseconds, maximum precision.
 */
enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };

static int timestamp_precision = TSP_USEC;
SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
    &timestamp_precision, 0, "File timestamp precision (0: seconds, "
    "1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to us, "
    "3+: sec + ns (max. precision))");

/*
 * Get a current timestamp.
 */
void
vfs_timestamp(struct timespec *tsp)
{
	struct timeval tv;

	switch (timestamp_precision) {
	case TSP_SEC:
		tsp->tv_sec = time_second;
		tsp->tv_nsec = 0;
		break;
	case TSP_HZ:
		getnanotime(tsp);
		break;
	case TSP_USEC:
		microtime(&tv);
		TIMEVAL_TO_TIMESPEC(&tv, tsp);
		break;
	case TSP_NSEC:
	default:
		nanotime(tsp);
		break;
	}
}

/*
 * Set vnode attributes to VNOVAL
 */
void
vattr_null(struct vattr *vap)
{

	vap->va_type = VNON;
	vap->va_size = VNOVAL;
	vap->va_bytes = VNOVAL;
	vap->va_mode = VNOVAL;
	vap->va_nlink = VNOVAL;
	vap->va_uid = VNOVAL;
	vap->va_gid = VNOVAL;
	vap->va_fsid = VNOVAL;
	vap->va_fileid = VNOVAL;
	vap->va_blocksize = VNOVAL;
	vap->va_rdev = VNOVAL;
	vap->va_atime.tv_sec = VNOVAL;
	vap->va_atime.tv_nsec = VNOVAL;
	vap->va_mtime.tv_sec = VNOVAL;
	vap->va_mtime.tv_nsec = VNOVAL;
	vap->va_ctime.tv_sec = VNOVAL;
	vap->va_ctime.tv_nsec = VNOVAL;
	vap->va_birthtime.tv_sec = VNOVAL;
	vap->va_birthtime.tv_nsec = VNOVAL;
	vap->va_flags = VNOVAL;
	vap->va_gen = VNOVAL;
	vap->va_vaflags = 0;
	vap->va_filerev = VNOVAL;
	vap->va_bsdflags = 0;
}

/*
 * Try to reduce the total number of vnodes.
 *
 * This routine (and its user) are buggy in at least the following ways:
 * - all parameters were picked years ago when RAM sizes were significantly
 *   smaller
 * - it can pick vnodes based on pages used by the vm object, but filesystems
 *   like ZFS don't use it making the pick broken
 * - since ZFS has its own aging policy it gets partially combated by this one
 * - a dedicated method should be provided for filesystems to let them decide
 *   whether the vnode should be recycled
 *
 * This routine is called when we have too many vnodes.  It attempts
 * to free <count> vnodes and will potentially free vnodes that still
 * have VM backing store (VM backing store is typically the cause
 * of a vnode blowout so we want to do this).
 * Therefore, this operation is not considered cheap.
 *
 * A number of conditions may prevent a vnode from being reclaimed.
 * The buffer cache may have references on the vnode, a directory
 * vnode may still have references due to the namei cache representing
 * underlying files, or the vnode may be in active use.  It is not
 * desirable to reuse such vnodes.  These conditions may cause the
 * number of vnodes to reach some minimum value regardless of what
 * you set kern.maxvnodes to.  Do not set kern.maxvnodes too low.
 *
 * @param reclaim_nc_src Only reclaim directories with outgoing namecache
 *			 entries if this argument is true
 * @param trigger	 Only reclaim vnodes with fewer than this many resident
 *			 pages.
 * @param target	 How many vnodes to reclaim.
 * @return		 The number of vnodes that were reclaimed.
 */
static int
vlrureclaim(bool reclaim_nc_src, int trigger, u_long target)
{
	struct vnode *vp, *mvp;
	struct mount *mp;
	struct vm_object *object;
	u_long done;
	bool retried;

	mtx_assert(&vnode_list_mtx, MA_OWNED);

	retried = false;
	done = 0;

	mvp = vnode_list_reclaim_marker;
restart:
	vp = mvp;
	while (done < target) {
		vp = TAILQ_NEXT(vp, v_vnodelist);
		if (__predict_false(vp == NULL))
			break;

		if (__predict_false(vp->v_type == VMARKER))
			continue;

		/*
		 * If it's been deconstructed already, it's still
		 * referenced, or it exceeds the trigger, skip it.
		 * Also skip free vnodes.  We are trying to make space
		 * for more free vnodes, not reduce their count.
		 */
		if (vp->v_usecount > 0 || vp->v_holdcnt == 0 ||
		    (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)))
			goto next_iter;

		if (vp->v_type == VBAD || vp->v_type == VNON)
			goto next_iter;

		object = atomic_load_ptr(&vp->v_object);
		if (object == NULL || object->resident_page_count > trigger) {
			goto next_iter;
		}

		/*
		 * Handle races against vnode allocation.  Filesystems lock the
		 * vnode some time after it gets returned from getnewvnode,
		 * despite type and hold count being manipulated earlier.
		 * Resorting to checking v_mount restores guarantees present
		 * before the global list was reworked to contain all vnodes.
		 */
		if (!VI_TRYLOCK(vp))
			goto next_iter;
		if (__predict_false(vp->v_type == VBAD || vp->v_type == VNON)) {
			VI_UNLOCK(vp);
			goto next_iter;
		}
		if (vp->v_mount == NULL) {
			VI_UNLOCK(vp);
			goto next_iter;
		}
		vholdl(vp);
		VI_UNLOCK(vp);
		TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
		TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist);
		mtx_unlock(&vnode_list_mtx);

		if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
			vdrop_recycle(vp);
			goto next_iter_unlocked;
		}
		if (VOP_LOCK(vp, LK_EXCLUSIVE|LK_NOWAIT) != 0) {
			vdrop_recycle(vp);
			vn_finished_write(mp);
			goto next_iter_unlocked;
		}

		VI_LOCK(vp);
		if (vp->v_usecount > 0 ||
		    (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) ||
		    (vp->v_object != NULL && vp->v_object->handle == vp &&
		    vp->v_object->resident_page_count > trigger)) {
			VOP_UNLOCK(vp);
			vdropl_recycle(vp);
			vn_finished_write(mp);
			goto next_iter_unlocked;
		}
		recycles_count++;
		vgonel(vp);
		VOP_UNLOCK(vp);
		vdropl_recycle(vp);
		vn_finished_write(mp);
		done++;
next_iter_unlocked:
		maybe_yield();
		mtx_lock(&vnode_list_mtx);
		goto restart;
next_iter:
		MPASS(vp->v_type != VMARKER);
		if (!should_yield())
			continue;
		TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
		TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist);
		mtx_unlock(&vnode_list_mtx);
		kern_yield(PRI_USER);
		mtx_lock(&vnode_list_mtx);
		goto restart;
	}
	if (done == 0 && !retried) {
		TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
		TAILQ_INSERT_HEAD(&vnode_list, mvp, v_vnodelist);
		retried = true;
		goto restart;
	}
	return (done);
}

static int max_free_per_call = 10000;
SYSCTL_INT(_debug, OID_AUTO, max_vnlru_free, CTLFLAG_RW, &max_free_per_call, 0,
    "limit on vnode free requests per call to the vnlru_free routine (legacy)");
SYSCTL_INT(_vfs_vnode_vnlru, OID_AUTO, max_free_per_call, CTLFLAG_RW,
    &max_free_per_call, 0,
    "limit on vnode free requests per call to the vnlru_free routine");

/*
 * Attempt to recycle requested amount of free vnodes.
 */
static int
vnlru_free_impl(int count, struct vfsops *mnt_op, struct vnode *mvp, bool isvnlru)
{
	struct vnode *vp;
	struct mount *mp;
	int ocount;
	bool retried;

	mtx_assert(&vnode_list_mtx, MA_OWNED);
	if (count > max_free_per_call)
		count = max_free_per_call;
	if (count == 0) {
		mtx_unlock(&vnode_list_mtx);
		return (0);
	}
	ocount = count;
	retried = false;
	vp = mvp;
	for (;;) {
		vp = TAILQ_NEXT(vp, v_vnodelist);
		if (__predict_false(vp == NULL)) {
			/*
			 * The free vnode marker can be past eligible vnodes:
			 * 1. if vdbatch_process trylock failed
			 * 2. if vtryrecycle failed
			 *
			 * If so, start the scan from scratch.
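			 * The retried flag below makes sure this restart
			 * happens at most once per call.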
			 */
			if (!retried && vnlru_read_freevnodes() > 0) {
				TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
				TAILQ_INSERT_HEAD(&vnode_list, mvp, v_vnodelist);
				vp = mvp;
				retried = true;
				continue;
			}

			/*
			 * Give up
			 */
			TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
			TAILQ_INSERT_TAIL(&vnode_list, mvp, v_vnodelist);
			mtx_unlock(&vnode_list_mtx);
			break;
		}
		if (__predict_false(vp->v_type == VMARKER))
			continue;
		if (vp->v_holdcnt > 0)
			continue;
		/*
		 * Don't recycle if our vnode is from different type
		 * of mount point.  Note that mp is type-safe, the
		 * check does not reach unmapped address even if
		 * vnode is reclaimed.
		 */
		if (mnt_op != NULL && (mp = vp->v_mount) != NULL &&
		    mp->mnt_op != mnt_op) {
			continue;
		}
		if (__predict_false(vp->v_type == VBAD || vp->v_type == VNON)) {
			continue;
		}
		if (!vhold_recycle_free(vp))
			continue;
		TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
		TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist);
		mtx_unlock(&vnode_list_mtx);
		/*
		 * FIXME: ignores the return value, meaning it may be that
		 * nothing got recycled but it claims otherwise to the caller.
		 *
		 * Originally the value started being ignored in 2005 with
		 * 114a1006a8204aa156e1f9ad6476cdff89cada7f .
		 *
		 * Respecting the value can run into significant stalls if most
		 * vnodes belong to one file system and it has writes
		 * suspended.  In presence of many threads and millions of
		 * vnodes they keep contending on the vnode_list_mtx lock only
		 * to find vnodes they can't recycle.
		 *
		 * The solution would be to pre-check if the vnode is likely to
		 * be recycle-able, but it needs to happen with the
		 * vnode_list_mtx lock held.  This runs into a problem where
		 * VOP_GETWRITEMOUNT (currently needed to find out if writes
		 * are frozen) can take locks which LOR against it.
		 *
		 * Check nullfs for one example (null_getwritemount).
		 */
		vtryrecycle(vp, isvnlru);
		count--;
		if (count == 0) {
			break;
		}
		mtx_lock(&vnode_list_mtx);
		vp = mvp;
	}
	mtx_assert(&vnode_list_mtx, MA_NOTOWNED);
	return (ocount - count);
}

/*
 * XXX: returns without vnode_list_mtx locked!
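 * Callers therefore must not assume the lock is still held on return; the
 * MA_NOTOWNED assertions in the wrappers below document this.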
 */
static int
vnlru_free_locked_direct(int count)
{
	int ret;

	mtx_assert(&vnode_list_mtx, MA_OWNED);
	ret = vnlru_free_impl(count, NULL, vnode_list_free_marker, false);
	mtx_assert(&vnode_list_mtx, MA_NOTOWNED);
	return (ret);
}

static int
vnlru_free_locked_vnlru(int count)
{
	int ret;

	mtx_assert(&vnode_list_mtx, MA_OWNED);
	ret = vnlru_free_impl(count, NULL, vnode_list_free_marker, true);
	mtx_assert(&vnode_list_mtx, MA_NOTOWNED);
	return (ret);
}

static int
vnlru_free_vnlru(int count)
{

	mtx_lock(&vnode_list_mtx);
	return (vnlru_free_locked_vnlru(count));
}

void
vnlru_free_vfsops(int count, struct vfsops *mnt_op, struct vnode *mvp)
{

	MPASS(mnt_op != NULL);
	MPASS(mvp != NULL);
	VNPASS(mvp->v_type == VMARKER, mvp);
	mtx_lock(&vnode_list_mtx);
	vnlru_free_impl(count, mnt_op, mvp, true);
	mtx_assert(&vnode_list_mtx, MA_NOTOWNED);
}

struct vnode *
vnlru_alloc_marker(void)
{
	struct vnode *mvp;

	mvp = vn_alloc_marker(NULL);
	mtx_lock(&vnode_list_mtx);
	TAILQ_INSERT_BEFORE(vnode_list_free_marker, mvp, v_vnodelist);
	mtx_unlock(&vnode_list_mtx);
	return (mvp);
}

void
vnlru_free_marker(struct vnode *mvp)
{
	mtx_lock(&vnode_list_mtx);
	TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
	mtx_unlock(&vnode_list_mtx);
	vn_free_marker(mvp);
}

static void
vnlru_recalc(void)
{

	mtx_assert(&vnode_list_mtx, MA_OWNED);
	gapvnodes = imax(desiredvnodes - wantfreevnodes, 100);
	vhiwat = gapvnodes / 11;	/* 9% -- just under the 10% in vlrureclaim() */
	vlowat = vhiwat / 2;
}

/*
 * Attempt to recycle vnodes in a context that is always safe to block.
 * Calling vlrurecycle() from the bowels of filesystem code has some
 * interesting deadlock problems.
 */
static struct proc *vnlruproc;
static int vnlruproc_sig;
static u_long vnlruproc_kicks;

SYSCTL_ULONG(_vfs_vnode_vnlru, OID_AUTO, kicks, CTLFLAG_RD, &vnlruproc_kicks, 0,
    "Number of times vnlru awakened due to vnode shortage");

#define VNLRU_COUNT_SLOP 100

/*
 * The main freevnodes counter is only updated when a counter local to CPU
 * diverges from 0 by more than VNLRU_FREEVNODES_SLOP.  CPUs are conditionally
 * walked to compute a more accurate total.
 *
 * Note: the actual value at any given moment can still exceed slop, but it
 * should not be by significant margin in practice.
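 * Concretely, each CPU accumulates deltas in a per-CPU int8_t
 * (pc_vfs_freevnodes) and folds them into the global counter once the local
 * value reaches +/- VNLRU_FREEVNODES_SLOP; see vfs_freevnodes_inc() and
 * vfs_freevnodes_dec() below.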
 */
#define VNLRU_FREEVNODES_SLOP 126

static void __noinline
vfs_freevnodes_rollup(int8_t *lfreevnodes)
{

	atomic_add_long(&freevnodes, *lfreevnodes);
	*lfreevnodes = 0;
	critical_exit();
}

static __inline void
vfs_freevnodes_inc(void)
{
	int8_t *lfreevnodes;

	critical_enter();
	lfreevnodes = PCPU_PTR(vfs_freevnodes);
	(*lfreevnodes)++;
	if (__predict_false(*lfreevnodes == VNLRU_FREEVNODES_SLOP))
		vfs_freevnodes_rollup(lfreevnodes);
	else
		critical_exit();
}

static __inline void
vfs_freevnodes_dec(void)
{
	int8_t *lfreevnodes;

	critical_enter();
	lfreevnodes = PCPU_PTR(vfs_freevnodes);
	(*lfreevnodes)--;
	if (__predict_false(*lfreevnodes == -VNLRU_FREEVNODES_SLOP))
		vfs_freevnodes_rollup(lfreevnodes);
	else
		critical_exit();
}

static u_long
vnlru_read_freevnodes(void)
{
	long slop, rfreevnodes, rfreevnodes_old;
	int cpu;

	rfreevnodes = atomic_load_long(&freevnodes);
	rfreevnodes_old = atomic_load_long(&freevnodes_old);

	if (rfreevnodes > rfreevnodes_old)
		slop = rfreevnodes - rfreevnodes_old;
	else
		slop = rfreevnodes_old - rfreevnodes;
	if (slop < VNLRU_FREEVNODES_SLOP)
		return (rfreevnodes >= 0 ? rfreevnodes : 0);
	CPU_FOREACH(cpu) {
		rfreevnodes += cpuid_to_pcpu[cpu]->pc_vfs_freevnodes;
	}
	atomic_store_long(&freevnodes_old, rfreevnodes);
	return (freevnodes_old >= 0 ? freevnodes_old : 0);
}

static bool
vnlru_under(u_long rnumvnodes, u_long limit)
{
	u_long rfreevnodes, space;

	if (__predict_false(rnumvnodes > desiredvnodes))
		return (true);

	space = desiredvnodes - rnumvnodes;
	if (space < limit) {
		rfreevnodes = vnlru_read_freevnodes();
		if (rfreevnodes > wantfreevnodes)
			space += rfreevnodes - wantfreevnodes;
	}
	return (space < limit);
}

static void
vnlru_kick_locked(void)
{

	mtx_assert(&vnode_list_mtx, MA_OWNED);
	if (vnlruproc_sig == 0) {
		vnlruproc_sig = 1;
		vnlruproc_kicks++;
		wakeup(vnlruproc);
	}
}

static void
vnlru_kick_cond(void)
{

	if (vnlru_read_freevnodes() > wantfreevnodes)
		return;

	if (vnlruproc_sig)
		return;
	mtx_lock(&vnode_list_mtx);
	vnlru_kick_locked();
	mtx_unlock(&vnode_list_mtx);
}

static void
vnlru_proc_sleep(void)
{

	if (vnlruproc_sig) {
		vnlruproc_sig = 0;
		wakeup(&vnlruproc_sig);
	}
	msleep(vnlruproc, &vnode_list_mtx, PVFS|PDROP, "vlruwt", hz);
}

/*
 * A lighter version of the machinery below.
 *
 * Tries to reach goals only by recycling free vnodes and does not invoke
 * uma_reclaim(UMA_RECLAIM_DRAIN).
 *
 * This works around pathological behavior in vnlru in presence of tons of free
 * vnodes, but without having to rewrite the machinery at this time.  Said
 * behavior boils down to continuously trying to reclaim all kinds of vnodes
 * (cycling through all levels of "force") when the count is transiently above
 * limit.  This happens a lot when all vnodes are used up and vn_alloc
 * speculatively increments the counter.
 *
 * Sample testcase: vnode limit 8388608, 20 separate directory trees each with
 * 1 million files in total and 20 find(1) processes stating them in parallel
 * (one per each tree).
 *
 * On a kernel with only stock machinery this needs anywhere between 60 and 120
 * seconds to execute (time varies *wildly* between runs).  With the workaround
 * it consistently stays around 20 seconds [it got further down with later
 * changes].
 *
 * That is to say the entire thing needs a fundamental redesign (most notably
 * to accommodate faster recycling), the above only tries to get it out of the
 * way.
 *
 * Return values are:
 * -1 -- fallback to regular vnlru loop
 *  0 -- do nothing, go to sleep
 * >0 -- recycle this many vnodes
 */
static long
vnlru_proc_light_pick(void)
{
	u_long rnumvnodes, rfreevnodes;

	if (vstir || vnlruproc_sig == 1)
		return (-1);

	rnumvnodes = atomic_load_long(&numvnodes);
	rfreevnodes = vnlru_read_freevnodes();

	/*
	 * vnode limit might have changed and now we may be at a significant
	 * excess.  Bail if we can't sort it out with free vnodes.
	 *
	 * Due to atomic updates the count can legitimately go above
	 * the limit for a short period, don't bother doing anything in
	 * that case.
	 */
	if (rnumvnodes > desiredvnodes + VNLRU_COUNT_SLOP + 10) {
		if (rnumvnodes - rfreevnodes >= desiredvnodes ||
		    rfreevnodes <= wantfreevnodes) {
			return (-1);
		}

		return (rnumvnodes - desiredvnodes);
	}

	/*
	 * Don't try to reach wantfreevnodes target if there are too few vnodes
	 * to begin with.
	 */
	if (rnumvnodes < wantfreevnodes) {
		return (0);
	}

	if (rfreevnodes < wantfreevnodes) {
		return (-1);
	}

	return (0);
}

static bool
vnlru_proc_light(void)
{
	long freecount;

	mtx_assert(&vnode_list_mtx, MA_NOTOWNED);

	freecount = vnlru_proc_light_pick();
	if (freecount == -1)
		return (false);

	if (freecount != 0) {
		vnlru_free_vnlru(freecount);
	}

	mtx_lock(&vnode_list_mtx);
	vnlru_proc_sleep();
	mtx_assert(&vnode_list_mtx, MA_NOTOWNED);
	return (true);
}

static u_long uma_reclaim_calls;
SYSCTL_ULONG(_vfs_vnode_vnlru, OID_AUTO, uma_reclaim_calls, CTLFLAG_RD | CTLFLAG_STATS,
    &uma_reclaim_calls, 0, "Number of calls to uma_reclaim");

static void
vnlru_proc(void)
{
	u_long rnumvnodes, rfreevnodes, target;
	unsigned long onumvnodes;
	int done, force, trigger, usevnodes;
	bool reclaim_nc_src, want_reread;

	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, vnlruproc,
	    SHUTDOWN_PRI_FIRST);

	force = 0;
	want_reread = false;
	for (;;) {
		kproc_suspend_check(vnlruproc);

		if (force == 0 && vnlru_proc_light())
			continue;

		mtx_lock(&vnode_list_mtx);
		rnumvnodes = atomic_load_long(&numvnodes);

		if (want_reread) {
			force = vnlru_under(numvnodes, vhiwat) ? 1 : 0;
			want_reread = false;
		}

		/*
		 * If numvnodes is too large (due to desiredvnodes being
		 * adjusted using its sysctl, or emergency growth), first
		 * try to reduce it by discarding free vnodes.
1806 */ 1807 if (rnumvnodes > desiredvnodes + 10) { 1808 vnlru_free_locked_vnlru(rnumvnodes - desiredvnodes); 1809 mtx_lock(&vnode_list_mtx); 1810 rnumvnodes = atomic_load_long(&numvnodes); 1811 } 1812 /* 1813 * Sleep if the vnode cache is in a good state. This is 1814 * when it is not over-full and has space for about a 4% 1815 * or 9% expansion (by growing its size or inexcessively 1816 * reducing free vnode count). Otherwise, try to reclaim 1817 * space for a 10% expansion. 1818 */ 1819 if (vstir && force == 0) { 1820 force = 1; 1821 vstir = false; 1822 } 1823 if (force == 0 && !vnlru_under(rnumvnodes, vlowat)) { 1824 vnlru_proc_sleep(); 1825 continue; 1826 } 1827 rfreevnodes = vnlru_read_freevnodes(); 1828 1829 onumvnodes = rnumvnodes; 1830 /* 1831 * Calculate parameters for recycling. These are the same 1832 * throughout the loop to give some semblance of fairness. 1833 * The trigger point is to avoid recycling vnodes with lots 1834 * of resident pages. We aren't trying to free memory; we 1835 * are trying to recycle or at least free vnodes. 1836 */ 1837 if (rnumvnodes <= desiredvnodes) 1838 usevnodes = rnumvnodes - rfreevnodes; 1839 else 1840 usevnodes = rnumvnodes; 1841 if (usevnodes <= 0) 1842 usevnodes = 1; 1843 /* 1844 * The trigger value is chosen to give a conservatively 1845 * large value to ensure that it alone doesn't prevent 1846 * making progress. The value can easily be so large that 1847 * it is effectively infinite in some congested and 1848 * misconfigured cases, and this is necessary. Normally 1849 * it is about 8 to 100 (pages), which is quite large. 1850 */ 1851 trigger = vm_cnt.v_page_count * 2 / usevnodes; 1852 if (force < 2) 1853 trigger = vsmalltrigger; 1854 reclaim_nc_src = force >= 3; 1855 target = rnumvnodes * (int64_t)gapvnodes / imax(desiredvnodes, 1); 1856 target = target / 10 + 1; 1857 done = vlrureclaim(reclaim_nc_src, trigger, target); 1858 mtx_unlock(&vnode_list_mtx); 1859 /* 1860 * Total number of vnodes can transiently go slightly above the 1861 * limit (see vn_alloc_hard), no need to call uma_reclaim if 1862 * this happens. 1863 */ 1864 if (onumvnodes + VNLRU_COUNT_SLOP + 1000 > desiredvnodes && 1865 numvnodes <= desiredvnodes) { 1866 uma_reclaim_calls++; 1867 uma_reclaim(UMA_RECLAIM_DRAIN); 1868 } 1869 if (done == 0) { 1870 if (force == 0 || force == 1) { 1871 force = 2; 1872 continue; 1873 } 1874 if (force == 2) { 1875 force = 3; 1876 continue; 1877 } 1878 want_reread = true; 1879 force = 0; 1880 vnlru_nowhere++; 1881 tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3); 1882 } else { 1883 want_reread = true; 1884 kern_yield(PRI_USER); 1885 } 1886 } 1887 } 1888 1889 static struct kproc_desc vnlru_kp = { 1890 "vnlru", 1891 vnlru_proc, 1892 &vnlruproc 1893 }; 1894 SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, 1895 &vnlru_kp); 1896 1897 /* 1898 * Routines having to do with the management of the vnode table. 1899 */ 1900 1901 /* 1902 * Try to recycle a freed vnode. 1903 */ 1904 static int 1905 vtryrecycle(struct vnode *vp, bool isvnlru) 1906 { 1907 struct mount *vnmp; 1908 1909 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 1910 VNPASS(vp->v_holdcnt > 0, vp); 1911 /* 1912 * This vnode may found and locked via some other list, if so we 1913 * can't recycle it yet. 
1914 */ 1915 if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) { 1916 CTR2(KTR_VFS, 1917 "%s: impossible to recycle, vp %p lock is already held", 1918 __func__, vp); 1919 vdrop_recycle(vp); 1920 return (EWOULDBLOCK); 1921 } 1922 /* 1923 * Don't recycle if its filesystem is being suspended. 1924 */ 1925 if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) { 1926 VOP_UNLOCK(vp); 1927 CTR2(KTR_VFS, 1928 "%s: impossible to recycle, cannot start the write for %p", 1929 __func__, vp); 1930 vdrop_recycle(vp); 1931 return (EBUSY); 1932 } 1933 /* 1934 * If we got this far, we need to acquire the interlock and see if 1935 * anyone picked up this vnode from another list. If not, we will 1936 * mark it with DOOMED via vgonel() so that anyone who does find it 1937 * will skip over it. 1938 */ 1939 VI_LOCK(vp); 1940 if (vp->v_usecount) { 1941 VOP_UNLOCK(vp); 1942 vdropl_recycle(vp); 1943 vn_finished_write(vnmp); 1944 CTR2(KTR_VFS, 1945 "%s: impossible to recycle, %p is already referenced", 1946 __func__, vp); 1947 return (EBUSY); 1948 } 1949 if (!VN_IS_DOOMED(vp)) { 1950 if (isvnlru) 1951 recycles_free_count++; 1952 else 1953 counter_u64_add(direct_recycles_free_count, 1); 1954 vgonel(vp); 1955 } 1956 VOP_UNLOCK(vp); 1957 vdropl_recycle(vp); 1958 vn_finished_write(vnmp); 1959 return (0); 1960 } 1961 1962 /* 1963 * Allocate a new vnode. 1964 * 1965 * The operation never returns an error. Returning an error was disabled 1966 * in r145385 (dated 2005) with the following comment: 1967 * 1968 * XXX Not all VFS_VGET/ffs_vget callers check returns. 1969 * 1970 * Given the age of this commit (almost 15 years at the time of writing this 1971 * comment) restoring the ability to fail requires a significant audit of 1972 * all codepaths. 1973 * 1974 * The routine can try to free a vnode or stall for up to 1 second waiting for 1975 * vnlru to clear things up, but ultimately always performs a M_WAITOK allocation. 1976 */ 1977 static u_long vn_alloc_cyclecount; 1978 static u_long vn_alloc_sleeps; 1979 1980 SYSCTL_ULONG(_vfs_vnode_stats, OID_AUTO, alloc_sleeps, CTLFLAG_RD, &vn_alloc_sleeps, 0, 1981 "Number of times vnode allocation blocked waiting on vnlru"); 1982 1983 static struct vnode * __noinline 1984 vn_alloc_hard(struct mount *mp, u_long rnumvnodes, bool bumped) 1985 { 1986 u_long rfreevnodes; 1987 1988 if (bumped) { 1989 if (rnumvnodes > desiredvnodes + VNLRU_COUNT_SLOP) { 1990 atomic_subtract_long(&numvnodes, 1); 1991 bumped = false; 1992 } 1993 } 1994 1995 mtx_lock(&vnode_list_mtx); 1996 1997 /* 1998 * Reload 'numvnodes', as since we acquired the lock, it may have 1999 * changed significantly if we waited, and 'rnumvnodes' above was only 2000 * actually passed if 'bumped' is true (else it is 0). 2001 */ 2002 rnumvnodes = atomic_load_long(&numvnodes); 2003 if (rnumvnodes + !bumped < desiredvnodes) { 2004 vn_alloc_cyclecount = 0; 2005 mtx_unlock(&vnode_list_mtx); 2006 goto alloc; 2007 } 2008 2009 rfreevnodes = vnlru_read_freevnodes(); 2010 if (vn_alloc_cyclecount++ >= rfreevnodes) { 2011 vn_alloc_cyclecount = 0; 2012 vstir = true; 2013 } 2014 2015 /* 2016 * Grow the vnode cache if it will not be above its target max after 2017 * growing. Otherwise, if there is at least one free vnode, try to 2018 * reclaim 1 item from it before growing the cache (possibly above its 2019 * target max if the reclamation failed or is delayed). 
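 * Note (added for clarity): vnlru_free_locked_direct() drops
 * vnode_list_mtx whether or not it manages to reclaim anything, which
 * is why the code below asserts MA_NOTOWNED immediately after the call.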
2020 */ 2021 if (vnlru_free_locked_direct(1) > 0) 2022 goto alloc; 2023 mtx_assert(&vnode_list_mtx, MA_NOTOWNED); 2024 if (mp == NULL || (mp->mnt_kern_flag & MNTK_SUSPEND) == 0) { 2025 /* 2026 * Wait for space for a new vnode. 2027 */ 2028 if (bumped) { 2029 atomic_subtract_long(&numvnodes, 1); 2030 bumped = false; 2031 } 2032 mtx_lock(&vnode_list_mtx); 2033 vnlru_kick_locked(); 2034 vn_alloc_sleeps++; 2035 msleep(&vnlruproc_sig, &vnode_list_mtx, PVFS, "vlruwk", hz); 2036 if (atomic_load_long(&numvnodes) + 1 > desiredvnodes && 2037 vnlru_read_freevnodes() > 1) 2038 vnlru_free_locked_direct(1); 2039 else 2040 mtx_unlock(&vnode_list_mtx); 2041 } 2042 alloc: 2043 mtx_assert(&vnode_list_mtx, MA_NOTOWNED); 2044 if (!bumped) 2045 atomic_add_long(&numvnodes, 1); 2046 vnlru_kick_cond(); 2047 return (uma_zalloc_smr(vnode_zone, M_WAITOK)); 2048 } 2049 2050 static struct vnode * 2051 vn_alloc(struct mount *mp) 2052 { 2053 u_long rnumvnodes; 2054 2055 if (__predict_false(vn_alloc_cyclecount != 0)) 2056 return (vn_alloc_hard(mp, 0, false)); 2057 rnumvnodes = atomic_fetchadd_long(&numvnodes, 1) + 1; 2058 if (__predict_false(vnlru_under(rnumvnodes, vlowat))) { 2059 return (vn_alloc_hard(mp, rnumvnodes, true)); 2060 } 2061 2062 return (uma_zalloc_smr(vnode_zone, M_WAITOK)); 2063 } 2064 2065 static void 2066 vn_free(struct vnode *vp) 2067 { 2068 2069 atomic_subtract_long(&numvnodes, 1); 2070 uma_zfree_smr(vnode_zone, vp); 2071 } 2072 2073 /* 2074 * Allocate a new vnode. 2075 */ 2076 int 2077 getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops, 2078 struct vnode **vpp) 2079 { 2080 struct vnode *vp; 2081 struct thread *td; 2082 struct lock_object *lo; 2083 2084 CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag); 2085 2086 KASSERT(vops->registered, 2087 ("%s: not registered vector op %p\n", __func__, vops)); 2088 cache_validate_vop_vector(mp, vops); 2089 2090 td = curthread; 2091 if (td->td_vp_reserved != NULL) { 2092 vp = td->td_vp_reserved; 2093 td->td_vp_reserved = NULL; 2094 } else { 2095 vp = vn_alloc(mp); 2096 } 2097 counter_u64_add(vnodes_created, 1); 2098 2099 vn_set_state(vp, VSTATE_UNINITIALIZED); 2100 2101 /* 2102 * Locks are given the generic name "vnode" when created. 2103 * Follow the historic practice of using the filesystem 2104 * name when they are allocated, e.g., "zfs", "ufs", "nfs", etc. 2105 * 2106 * Locks live in a witness group keyed on their name. Thus, 2107 * when a lock is renamed, it must also move from the witness 2108 * group of its old name to the witness group of its new name. 2109 * 2110 * The change only needs to be made when the vnode moves 2111 * from one filesystem type to another. We ensure that each 2112 * filesystem uses a single static name pointer for its tag so 2113 * that we can compare pointers rather than doing a strcmp(). 2114 */ 2115 lo = &vp->v_vnlock->lock_object; 2116 #ifdef WITNESS 2117 if (lo->lo_name != tag) { 2118 #endif 2119 lo->lo_name = tag; 2120 #ifdef WITNESS 2121 WITNESS_DESTROY(lo); 2122 WITNESS_INIT(lo, tag); 2123 } 2124 #endif 2125 /* 2126 * By default, don't allow shared locks unless filesystems opt-in. 2127 */ 2128 vp->v_vnlock->lock_object.lo_flags |= LK_NOSHARE; 2129 /* 2130 * Finalize various vnode identity bits.
2131 */ 2132 KASSERT(vp->v_object == NULL, ("stale v_object %p", vp)); 2133 KASSERT(vp->v_lockf == NULL, ("stale v_lockf %p", vp)); 2134 KASSERT(vp->v_pollinfo == NULL, ("stale v_pollinfo %p", vp)); 2135 vp->v_type = VNON; 2136 vp->v_op = vops; 2137 vp->v_irflag = 0; 2138 v_init_counters(vp); 2139 vn_seqc_init(vp); 2140 vp->v_bufobj.bo_ops = &buf_ops_bio; 2141 #ifdef DIAGNOSTIC 2142 if (mp == NULL && vops != &dead_vnodeops) 2143 printf("NULL mp in getnewvnode(9), tag %s\n", tag); 2144 #endif 2145 #ifdef MAC 2146 mac_vnode_init(vp); 2147 if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0) 2148 mac_vnode_associate_singlelabel(mp, vp); 2149 #endif 2150 if (mp != NULL) { 2151 vp->v_bufobj.bo_bsize = mp->mnt_stat.f_iosize; 2152 } 2153 2154 /* 2155 * For the filesystems which do not use vfs_hash_insert(), 2156 * still initialize v_hash to have vfs_hash_index() useful. 2157 * E.g., nullfs uses vfs_hash_index() on the lower vnode for 2158 * its own hashing. 2159 */ 2160 vp->v_hash = (uintptr_t)vp >> vnsz2log; 2161 2162 *vpp = vp; 2163 return (0); 2164 } 2165 2166 void 2167 getnewvnode_reserve(void) 2168 { 2169 struct thread *td; 2170 2171 td = curthread; 2172 MPASS(td->td_vp_reserved == NULL); 2173 td->td_vp_reserved = vn_alloc(NULL); 2174 } 2175 2176 void 2177 getnewvnode_drop_reserve(void) 2178 { 2179 struct thread *td; 2180 2181 td = curthread; 2182 if (td->td_vp_reserved != NULL) { 2183 vn_free(td->td_vp_reserved); 2184 td->td_vp_reserved = NULL; 2185 } 2186 } 2187 2188 static void __noinline 2189 freevnode(struct vnode *vp) 2190 { 2191 struct bufobj *bo; 2192 2193 /* 2194 * The vnode has been marked for destruction, so free it. 2195 * 2196 * The vnode will be returned to the zone where it will 2197 * normally remain until it is needed for another vnode. We 2198 * need to cleanup (or verify that the cleanup has already 2199 * been done) any residual data left from its current use 2200 * so as not to contaminate the freshly allocated vnode. 2201 */ 2202 CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp); 2203 /* 2204 * Paired with vgone. 2205 */ 2206 vn_seqc_write_end_free(vp); 2207 2208 bo = &vp->v_bufobj; 2209 VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't")); 2210 VNPASS(vp->v_holdcnt == VHOLD_NO_SMR, vp); 2211 VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count")); 2212 VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count")); 2213 VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's")); 2214 VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0")); 2215 VNASSERT(pctrie_is_empty(&bo->bo_clean.bv_root), vp, 2216 ("clean blk trie not empty")); 2217 VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0")); 2218 VNASSERT(pctrie_is_empty(&bo->bo_dirty.bv_root), vp, 2219 ("dirty blk trie not empty")); 2220 VNASSERT((vp->v_iflag & (VI_DOINGINACT | VI_OWEINACT)) == 0, vp, 2221 ("Leaked inactivation")); 2222 VI_UNLOCK(vp); 2223 cache_assert_no_entries(vp); 2224 2225 #ifdef MAC 2226 mac_vnode_destroy(vp); 2227 #endif 2228 if (vp->v_pollinfo != NULL) { 2229 /* 2230 * Use LK_NOWAIT to shut up witness about the lock. We may get 2231 * here while having another vnode locked when trying to 2232 * satisfy a lookup and needing to recycle. 
2233 */ 2234 VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT); 2235 destroy_vpollinfo(vp->v_pollinfo); 2236 VOP_UNLOCK(vp); 2237 vp->v_pollinfo = NULL; 2238 } 2239 vp->v_mountedhere = NULL; 2240 vp->v_unpcb = NULL; 2241 vp->v_rdev = NULL; 2242 vp->v_fifoinfo = NULL; 2243 vp->v_iflag = 0; 2244 vp->v_vflag = 0; 2245 bo->bo_flag = 0; 2246 vn_free(vp); 2247 } 2248 2249 /* 2250 * Delete from old mount point vnode list, if on one. 2251 */ 2252 static void 2253 delmntque(struct vnode *vp) 2254 { 2255 struct mount *mp; 2256 2257 VNPASS((vp->v_mflag & VMP_LAZYLIST) == 0, vp); 2258 2259 mp = vp->v_mount; 2260 MNT_ILOCK(mp); 2261 VI_LOCK(vp); 2262 vp->v_mount = NULL; 2263 VNASSERT(mp->mnt_nvnodelistsize > 0, vp, 2264 ("bad mount point vnode list size")); 2265 TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); 2266 mp->mnt_nvnodelistsize--; 2267 MNT_REL(mp); 2268 MNT_IUNLOCK(mp); 2269 /* 2270 * The caller expects the interlock to be still held. 2271 */ 2272 ASSERT_VI_LOCKED(vp, __func__); 2273 } 2274 2275 static int 2276 insmntque1_int(struct vnode *vp, struct mount *mp, bool dtr) 2277 { 2278 2279 KASSERT(vp->v_mount == NULL, 2280 ("insmntque: vnode already on per mount vnode list")); 2281 VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)")); 2282 if ((mp->mnt_kern_flag & MNTK_UNLOCKED_INSMNTQUE) == 0) { 2283 ASSERT_VOP_ELOCKED(vp, "insmntque: non-locked vp"); 2284 } else { 2285 KASSERT(!dtr, 2286 ("%s: can't have MNTK_UNLOCKED_INSMNTQUE and cleanup", 2287 __func__)); 2288 } 2289 2290 /* 2291 * We acquire the vnode interlock early to ensure that the 2292 * vnode cannot be recycled by another process releasing a 2293 * holdcnt on it before we get it on both the vnode list 2294 * and the active vnode list. The mount mutex protects only 2295 * manipulation of the vnode list and the vnode freelist 2296 * mutex protects only manipulation of the active vnode list. 2297 * Hence the need to hold the vnode interlock throughout. 2298 */ 2299 MNT_ILOCK(mp); 2300 VI_LOCK(vp); 2301 if (((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0 && 2302 ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 || 2303 mp->mnt_nvnodelistsize == 0)) && 2304 (vp->v_vflag & VV_FORCEINSMQ) == 0) { 2305 VI_UNLOCK(vp); 2306 MNT_IUNLOCK(mp); 2307 if (dtr) { 2308 vp->v_data = NULL; 2309 vp->v_op = &dead_vnodeops; 2310 vgone(vp); 2311 vput(vp); 2312 } 2313 return (EBUSY); 2314 } 2315 vp->v_mount = mp; 2316 MNT_REF(mp); 2317 TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); 2318 VNASSERT(mp->mnt_nvnodelistsize >= 0, vp, 2319 ("neg mount point vnode list size")); 2320 mp->mnt_nvnodelistsize++; 2321 VI_UNLOCK(vp); 2322 MNT_IUNLOCK(mp); 2323 return (0); 2324 } 2325 2326 /* 2327 * Insert into list of vnodes for the new mount point, if available. 2328 * insmntque() reclaims the vnode on insertion failure, insmntque1() 2329 * leaves handling of the vnode to the caller. 2330 */ 2331 int 2332 insmntque(struct vnode *vp, struct mount *mp) 2333 { 2334 return (insmntque1_int(vp, mp, true)); 2335 } 2336 2337 int 2338 insmntque1(struct vnode *vp, struct mount *mp) 2339 { 2340 return (insmntque1_int(vp, mp, false)); 2341 } 2342 2343 /* 2344 * Flush out and invalidate all buffers associated with a bufobj 2345 * Called with the underlying object locked. 
2346 */ 2347 int 2348 bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo) 2349 { 2350 int error; 2351 2352 BO_LOCK(bo); 2353 if (flags & V_SAVE) { 2354 error = bufobj_wwait(bo, slpflag, slptimeo); 2355 if (error) { 2356 BO_UNLOCK(bo); 2357 return (error); 2358 } 2359 if (bo->bo_dirty.bv_cnt > 0) { 2360 BO_UNLOCK(bo); 2361 do { 2362 error = BO_SYNC(bo, MNT_WAIT); 2363 } while (error == ERELOOKUP); 2364 if (error != 0) 2365 return (error); 2366 BO_LOCK(bo); 2367 if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) { 2368 BO_UNLOCK(bo); 2369 return (EBUSY); 2370 } 2371 } 2372 } 2373 /* 2374 * If you alter this loop please notice that interlock is dropped and 2375 * reacquired in flushbuflist. Special care is needed to ensure that 2376 * no race conditions occur from this. 2377 */ 2378 do { 2379 error = flushbuflist(&bo->bo_clean, 2380 flags, bo, slpflag, slptimeo); 2381 if (error == 0 && !(flags & V_CLEANONLY)) 2382 error = flushbuflist(&bo->bo_dirty, 2383 flags, bo, slpflag, slptimeo); 2384 if (error != 0 && error != EAGAIN) { 2385 BO_UNLOCK(bo); 2386 return (error); 2387 } 2388 } while (error != 0); 2389 2390 /* 2391 * Wait for I/O to complete. XXX needs cleaning up. The vnode can 2392 * have write I/O in-progress but if there is a VM object then the 2393 * VM object can also have read-I/O in-progress. 2394 */ 2395 do { 2396 bufobj_wwait(bo, 0, 0); 2397 if ((flags & V_VMIO) == 0 && bo->bo_object != NULL) { 2398 BO_UNLOCK(bo); 2399 vm_object_pip_wait_unlocked(bo->bo_object, "bovlbx"); 2400 BO_LOCK(bo); 2401 } 2402 } while (bo->bo_numoutput > 0); 2403 BO_UNLOCK(bo); 2404 2405 /* 2406 * Destroy the copy in the VM cache, too. 2407 */ 2408 if (bo->bo_object != NULL && 2409 (flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0) { 2410 VM_OBJECT_WLOCK(bo->bo_object); 2411 vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ? 2412 OBJPR_CLEANONLY : 0); 2413 VM_OBJECT_WUNLOCK(bo->bo_object); 2414 } 2415 2416 #ifdef INVARIANTS 2417 BO_LOCK(bo); 2418 if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO | 2419 V_ALLOWCLEAN)) == 0 && (bo->bo_dirty.bv_cnt > 0 || 2420 bo->bo_clean.bv_cnt > 0)) 2421 panic("vinvalbuf: flush failed"); 2422 if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0 && 2423 bo->bo_dirty.bv_cnt > 0) 2424 panic("vinvalbuf: flush dirty failed"); 2425 BO_UNLOCK(bo); 2426 #endif 2427 return (0); 2428 } 2429 2430 /* 2431 * Flush out and invalidate all buffers associated with a vnode. 2432 * Called with the underlying object locked. 2433 */ 2434 int 2435 vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo) 2436 { 2437 2438 CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags); 2439 ASSERT_VOP_LOCKED(vp, "vinvalbuf"); 2440 if (vp->v_object != NULL && vp->v_object->handle != vp) 2441 return (0); 2442 return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo)); 2443 } 2444 2445 /* 2446 * Flush out buffers on the specified list. 2447 * 2448 */ 2449 static int 2450 flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag, 2451 int slptimeo) 2452 { 2453 struct buf *bp, *nbp; 2454 int retval, error; 2455 daddr_t lblkno; 2456 b_xflags_t xflags; 2457 2458 ASSERT_BO_WLOCKED(bo); 2459 2460 retval = 0; 2461 TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) { 2462 /* 2463 * If we are flushing both V_NORMAL and V_ALT buffers then 2464 * do not skip any buffers. If we are flushing only V_NORMAL 2465 * buffers then skip buffers marked as BX_ALTDATA. 
If we are 2466 * flushing only V_ALT buffers then skip buffers not marked 2467 * as BX_ALTDATA. 2468 */ 2469 if (((flags & (V_NORMAL | V_ALT)) != (V_NORMAL | V_ALT)) && 2470 (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA) != 0) || 2471 ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0))) { 2472 continue; 2473 } 2474 if (nbp != NULL) { 2475 lblkno = nbp->b_lblkno; 2476 xflags = nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN); 2477 } 2478 retval = EAGAIN; 2479 error = BUF_TIMELOCK(bp, 2480 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo), 2481 "flushbuf", slpflag, slptimeo); 2482 if (error) { 2483 BO_LOCK(bo); 2484 return (error != ENOLCK ? error : EAGAIN); 2485 } 2486 KASSERT(bp->b_bufobj == bo, 2487 ("bp %p wrong b_bufobj %p should be %p", 2488 bp, bp->b_bufobj, bo)); 2489 /* 2490 * XXX Since there are no node locks for NFS, I 2491 * believe there is a slight chance that a delayed 2492 * write will occur while sleeping just above, so 2493 * check for it. 2494 */ 2495 if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) && 2496 (flags & V_SAVE)) { 2497 bremfree(bp); 2498 bp->b_flags |= B_ASYNC; 2499 bwrite(bp); 2500 BO_LOCK(bo); 2501 return (EAGAIN); /* XXX: why not loop ? */ 2502 } 2503 bremfree(bp); 2504 bp->b_flags |= (B_INVAL | B_RELBUF); 2505 bp->b_flags &= ~B_ASYNC; 2506 brelse(bp); 2507 BO_LOCK(bo); 2508 if (nbp == NULL) 2509 break; 2510 nbp = gbincore(bo, lblkno); 2511 if (nbp == NULL || (nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) 2512 != xflags) 2513 break; /* nbp invalid */ 2514 } 2515 return (retval); 2516 } 2517 2518 int 2519 bnoreuselist(struct bufv *bufv, struct bufobj *bo, daddr_t startn, daddr_t endn) 2520 { 2521 struct buf *bp; 2522 int error; 2523 daddr_t lblkno; 2524 2525 ASSERT_BO_LOCKED(bo); 2526 2527 for (lblkno = startn;;) { 2528 again: 2529 bp = buf_lookup_ge(bufv, lblkno); 2530 if (bp == NULL || bp->b_lblkno >= endn) 2531 break; 2532 error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | 2533 LK_INTERLOCK, BO_LOCKPTR(bo), "brlsfl", 0, 0); 2534 if (error != 0) { 2535 BO_RLOCK(bo); 2536 if (error == ENOLCK) 2537 goto again; 2538 return (error); 2539 } 2540 KASSERT(bp->b_bufobj == bo, 2541 ("bp %p wrong b_bufobj %p should be %p", 2542 bp, bp->b_bufobj, bo)); 2543 lblkno = bp->b_lblkno + 1; 2544 if ((bp->b_flags & B_MANAGED) == 0) 2545 bremfree(bp); 2546 bp->b_flags |= B_RELBUF; 2547 /* 2548 * In the VMIO case, use the B_NOREUSE flag to hint that the 2549 * pages backing each buffer in the range are unlikely to be 2550 * reused. Dirty buffers will have the hint applied once 2551 * they've been written. 2552 */ 2553 if ((bp->b_flags & B_VMIO) != 0) 2554 bp->b_flags |= B_NOREUSE; 2555 brelse(bp); 2556 BO_RLOCK(bo); 2557 } 2558 return (0); 2559 } 2560 2561 /* 2562 * Truncate a file's buffer and pages to a specified length. This 2563 * is in lieu of the old vinvalbuf mechanism, which performed unneeded 2564 * sync activity. 2565 */ 2566 int 2567 vtruncbuf(struct vnode *vp, off_t length, int blksize) 2568 { 2569 struct buf *bp, *nbp; 2570 struct bufobj *bo; 2571 daddr_t startlbn; 2572 2573 CTR4(KTR_VFS, "%s: vp %p with block %d:%ju", __func__, 2574 vp, blksize, (uintmax_t)length); 2575 2576 /* 2577 * Round up to the *next* lbn. 2578 */ 2579 startlbn = howmany(length, blksize); 2580 2581 ASSERT_VOP_LOCKED(vp, "vtruncbuf"); 2582 2583 bo = &vp->v_bufobj; 2584 restart_unlocked: 2585 BO_LOCK(bo); 2586 2587 while (v_inval_buf_range_locked(vp, bo, startlbn, INT64_MAX) == EAGAIN) 2588 ; 2589 2590 if (length > 0) { 2591 /* 2592 * Write out vnode metadata, e.g. 
indirect blocks. 2593 */ 2594 restartsync: 2595 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { 2596 if (bp->b_lblkno >= 0) 2597 continue; 2598 /* 2599 * Since we hold the vnode lock this should only 2600 * fail if we're racing with the buf daemon. 2601 */ 2602 if (BUF_LOCK(bp, 2603 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, 2604 BO_LOCKPTR(bo)) == ENOLCK) 2605 goto restart_unlocked; 2606 2607 VNASSERT((bp->b_flags & B_DELWRI), vp, 2608 ("buf(%p) on dirty queue without DELWRI", bp)); 2609 2610 bremfree(bp); 2611 bawrite(bp); 2612 BO_LOCK(bo); 2613 goto restartsync; 2614 } 2615 } 2616 2617 bufobj_wwait(bo, 0, 0); 2618 BO_UNLOCK(bo); 2619 vnode_pager_setsize(vp, length); 2620 2621 return (0); 2622 } 2623 2624 /* 2625 * Invalidate the cached pages of a file's buffer within the range of block 2626 * numbers [startlbn, endlbn). 2627 */ 2628 void 2629 v_inval_buf_range(struct vnode *vp, daddr_t startlbn, daddr_t endlbn, 2630 int blksize) 2631 { 2632 struct bufobj *bo; 2633 off_t start, end; 2634 2635 ASSERT_VOP_LOCKED(vp, "v_inval_buf_range"); 2636 2637 start = blksize * startlbn; 2638 end = blksize * endlbn; 2639 2640 bo = &vp->v_bufobj; 2641 BO_LOCK(bo); 2642 MPASS(blksize == bo->bo_bsize); 2643 2644 while (v_inval_buf_range_locked(vp, bo, startlbn, endlbn) == EAGAIN) 2645 ; 2646 2647 BO_UNLOCK(bo); 2648 vn_pages_remove(vp, OFF_TO_IDX(start), OFF_TO_IDX(end + PAGE_SIZE - 1)); 2649 } 2650 2651 static int 2652 v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo, 2653 daddr_t startlbn, daddr_t endlbn) 2654 { 2655 struct bufv *bv; 2656 struct buf *bp, *nbp; 2657 uint8_t anyfreed; 2658 bool clean; 2659 2660 ASSERT_VOP_LOCKED(vp, "v_inval_buf_range_locked"); 2661 ASSERT_BO_LOCKED(bo); 2662 2663 anyfreed = 1; 2664 clean = true; 2665 do { 2666 bv = clean ? &bo->bo_clean : &bo->bo_dirty; 2667 bp = buf_lookup_ge(bv, startlbn); 2668 if (bp == NULL) 2669 continue; 2670 TAILQ_FOREACH_FROM_SAFE(bp, &bv->bv_hd, b_bobufs, nbp) { 2671 if (bp->b_lblkno >= endlbn) 2672 break; 2673 if (BUF_LOCK(bp, 2674 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, 2675 BO_LOCKPTR(bo)) == ENOLCK) { 2676 BO_LOCK(bo); 2677 return (EAGAIN); 2678 } 2679 2680 bremfree(bp); 2681 bp->b_flags |= B_INVAL | B_RELBUF; 2682 bp->b_flags &= ~B_ASYNC; 2683 brelse(bp); 2684 anyfreed = 2; 2685 2686 BO_LOCK(bo); 2687 if (nbp != NULL && 2688 (((nbp->b_xflags & 2689 (clean ? BX_VNCLEAN : BX_VNDIRTY)) == 0) || 2690 nbp->b_vp != vp || 2691 (nbp->b_flags & B_DELWRI) == (clean? B_DELWRI: 0))) 2692 return (EAGAIN); 2693 } 2694 } while (clean = !clean, anyfreed-- > 0); 2695 return (0); 2696 } 2697 2698 static void 2699 buf_vlist_remove(struct buf *bp) 2700 { 2701 struct bufv *bv; 2702 b_xflags_t flags; 2703 2704 flags = bp->b_xflags; 2705 2706 KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); 2707 ASSERT_BO_WLOCKED(bp->b_bufobj); 2708 KASSERT((flags & (BX_VNDIRTY | BX_VNCLEAN)) != 0 && 2709 (flags & (BX_VNDIRTY | BX_VNCLEAN)) != (BX_VNDIRTY | BX_VNCLEAN), 2710 ("%s: buffer %p has invalid queue state", __func__, bp)); 2711 2712 if ((flags & BX_VNDIRTY) != 0) 2713 bv = &bp->b_bufobj->bo_dirty; 2714 else 2715 bv = &bp->b_bufobj->bo_clean; 2716 BUF_PCTRIE_REMOVE(&bv->bv_root, bp->b_lblkno); 2717 TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs); 2718 bv->bv_cnt--; 2719 bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN); 2720 } 2721 2722 /* 2723 * Add the buffer to the sorted clean or dirty block list. Return zero on 2724 * success, EEXIST if a buffer with this identity already exists, or another 2725 * error on allocation failure. 
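 *
 * Callers differ in how they treat EEXIST: buf_vlist_add() below
 * considers any error fatal and panics, while bgetvp() treats EEXIST as
 * a benign lost race and simply undoes its setup.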
2726 */ 2727 static inline int 2728 buf_vlist_find_or_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags) 2729 { 2730 struct bufv *bv; 2731 struct buf *n; 2732 int error; 2733 2734 ASSERT_BO_WLOCKED(bo); 2735 KASSERT((bo->bo_flag & BO_NOBUFS) == 0, 2736 ("buf_vlist_add: bo %p does not allow bufs", bo)); 2737 KASSERT((xflags & BX_VNDIRTY) == 0 || (bo->bo_flag & BO_DEAD) == 0, 2738 ("dead bo %p", bo)); 2739 KASSERT((bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) == xflags, 2740 ("buf_vlist_add: b_xflags %#x not set on bp %p", xflags, bp)); 2741 2742 if (xflags & BX_VNDIRTY) 2743 bv = &bo->bo_dirty; 2744 else 2745 bv = &bo->bo_clean; 2746 2747 error = buf_insert_lookup_le(bv, bp, &n); 2748 if (n == NULL) { 2749 KASSERT(error != EEXIST, 2750 ("buf_vlist_add: EEXIST but no existing buf found: bp %p", 2751 bp)); 2752 } else { 2753 KASSERT(n->b_lblkno <= bp->b_lblkno, 2754 ("buf_vlist_add: out of order insert/lookup: bp %p n %p", 2755 bp, n)); 2756 KASSERT((n->b_lblkno == bp->b_lblkno) == (error == EEXIST), 2757 ("buf_vlist_add: inconsistent result for existing buf: " 2758 "error %d bp %p n %p", error, bp, n)); 2759 } 2760 if (error != 0) 2761 return (error); 2762 2763 /* Keep the list ordered. */ 2764 if (n == NULL) { 2765 KASSERT(TAILQ_EMPTY(&bv->bv_hd) || 2766 bp->b_lblkno < TAILQ_FIRST(&bv->bv_hd)->b_lblkno, 2767 ("buf_vlist_add: queue order: " 2768 "%p should be before first %p", 2769 bp, TAILQ_FIRST(&bv->bv_hd))); 2770 TAILQ_INSERT_HEAD(&bv->bv_hd, bp, b_bobufs); 2771 } else { 2772 KASSERT(TAILQ_NEXT(n, b_bobufs) == NULL || 2773 bp->b_lblkno < TAILQ_NEXT(n, b_bobufs)->b_lblkno, 2774 ("buf_vlist_add: queue order: " 2775 "%p should be before next %p", 2776 bp, TAILQ_NEXT(n, b_bobufs))); 2777 TAILQ_INSERT_AFTER(&bv->bv_hd, n, bp, b_bobufs); 2778 } 2779 2780 bv->bv_cnt++; 2781 return (0); 2782 } 2783 2784 /* 2785 * Add the buffer to the sorted clean or dirty block list. 2786 * 2787 * NOTE: xflags is passed as a constant, optimizing this inline function! 2788 */ 2789 static void 2790 buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags) 2791 { 2792 int error; 2793 2794 KASSERT((bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) == 0, 2795 ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags)); 2796 bp->b_xflags |= xflags; 2797 error = buf_vlist_find_or_add(bp, bo, xflags); 2798 if (error) 2799 panic("buf_vlist_add: error=%d", error); 2800 } 2801 2802 /* 2803 * Look up a buffer using the buffer tries. 2804 */ 2805 struct buf * 2806 gbincore(struct bufobj *bo, daddr_t lblkno) 2807 { 2808 struct buf *bp; 2809 2810 ASSERT_BO_LOCKED(bo); 2811 bp = BUF_PCTRIE_LOOKUP(&bo->bo_clean.bv_root, lblkno); 2812 if (bp != NULL) 2813 return (bp); 2814 return (BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, lblkno)); 2815 } 2816 2817 /* 2818 * Look up a buf using the buffer tries, without the bufobj lock. This relies 2819 * on SMR for safe lookup, and bufs being in a no-free zone to provide type 2820 * stability of the result. Like other lockless lookups, the found buf may 2821 * already be invalid by the time this function returns. 2822 */ 2823 struct buf * 2824 gbincore_unlocked(struct bufobj *bo, daddr_t lblkno) 2825 { 2826 struct buf *bp; 2827 2828 ASSERT_BO_UNLOCKED(bo); 2829 bp = BUF_PCTRIE_LOOKUP_UNLOCKED(&bo->bo_clean.bv_root, lblkno); 2830 if (bp != NULL) 2831 return (bp); 2832 return (BUF_PCTRIE_LOOKUP_UNLOCKED(&bo->bo_dirty.bv_root, lblkno)); 2833 } 2834 2835 /* 2836 * Associate a buffer with a vnode. 
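 *
 * On success the buffer holds a reference on the vnode (vhold); it is
 * dropped again by brelvp() when the buffer is disassociated.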
2837 */ 2838 int 2839 bgetvp(struct vnode *vp, struct buf *bp) 2840 { 2841 struct bufobj *bo; 2842 int error; 2843 2844 bo = &vp->v_bufobj; 2845 ASSERT_BO_UNLOCKED(bo); 2846 VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free")); 2847 2848 CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags); 2849 VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp, 2850 ("bgetvp: bp already attached! %p", bp)); 2851 2852 /* 2853 * Add the buf to the vnode's clean list unless we lost a race and find 2854 * an existing buf in either dirty or clean. 2855 */ 2856 bp->b_vp = vp; 2857 bp->b_bufobj = bo; 2858 bp->b_xflags |= BX_VNCLEAN; 2859 error = EEXIST; 2860 BO_LOCK(bo); 2861 if (BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, bp->b_lblkno) == NULL) 2862 error = buf_vlist_find_or_add(bp, bo, BX_VNCLEAN); 2863 BO_UNLOCK(bo); 2864 if (__predict_true(error == 0)) { 2865 vhold(vp); 2866 return (0); 2867 } 2868 if (error != EEXIST) 2869 panic("bgetvp: buf_vlist_add error: %d", error); 2870 bp->b_vp = NULL; 2871 bp->b_bufobj = NULL; 2872 bp->b_xflags &= ~BX_VNCLEAN; 2873 return (error); 2874 } 2875 2876 /* 2877 * Disassociate a buffer from a vnode. 2878 */ 2879 void 2880 brelvp(struct buf *bp) 2881 { 2882 struct bufobj *bo; 2883 struct vnode *vp; 2884 2885 CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); 2886 KASSERT(bp->b_vp != NULL, ("brelvp: NULL")); 2887 2888 /* 2889 * Delete from old vnode list, if on one. 2890 */ 2891 vp = bp->b_vp; /* XXX */ 2892 bo = bp->b_bufobj; 2893 BO_LOCK(bo); 2894 buf_vlist_remove(bp); 2895 if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { 2896 bo->bo_flag &= ~BO_ONWORKLST; 2897 mtx_lock(&sync_mtx); 2898 LIST_REMOVE(bo, bo_synclist); 2899 syncer_worklist_len--; 2900 mtx_unlock(&sync_mtx); 2901 } 2902 bp->b_vp = NULL; 2903 bp->b_bufobj = NULL; 2904 BO_UNLOCK(bo); 2905 vdrop(vp); 2906 } 2907 2908 /* 2909 * Add an item to the syncer work queue. 
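 *
 * The queue is a wheel of lists: the bufobj lands in slot
 * (syncer_delayno + delay) & syncer_mask, i.e. roughly "delay" seconds
 * ahead of the slot the syncer is currently draining.  (The customary
 * default of about 30 seconds for regular files is an assumption here;
 * only the slot arithmetic is taken from the code below.)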
2910 */ 2911 static void 2912 vn_syncer_add_to_worklist(struct bufobj *bo, int delay) 2913 { 2914 int slot; 2915 2916 ASSERT_BO_WLOCKED(bo); 2917 2918 mtx_lock(&sync_mtx); 2919 if (bo->bo_flag & BO_ONWORKLST) 2920 LIST_REMOVE(bo, bo_synclist); 2921 else { 2922 bo->bo_flag |= BO_ONWORKLST; 2923 syncer_worklist_len++; 2924 } 2925 2926 if (delay > syncer_maxdelay - 2) 2927 delay = syncer_maxdelay - 2; 2928 slot = (syncer_delayno + delay) & syncer_mask; 2929 2930 LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist); 2931 mtx_unlock(&sync_mtx); 2932 } 2933 2934 static int 2935 sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS) 2936 { 2937 int error, len; 2938 2939 mtx_lock(&sync_mtx); 2940 len = syncer_worklist_len - sync_vnode_count; 2941 mtx_unlock(&sync_mtx); 2942 error = SYSCTL_OUT(req, &len, sizeof(len)); 2943 return (error); 2944 } 2945 2946 SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, 2947 CTLTYPE_INT | CTLFLAG_MPSAFE| CTLFLAG_RD, NULL, 0, 2948 sysctl_vfs_worklist_len, "I", "Syncer thread worklist length"); 2949 2950 static struct proc *updateproc; 2951 static void sched_sync(void); 2952 static struct kproc_desc up_kp = { 2953 "syncer", 2954 sched_sync, 2955 &updateproc 2956 }; 2957 SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp); 2958 2959 static int 2960 sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td) 2961 { 2962 struct vnode *vp; 2963 struct mount *mp; 2964 2965 *bo = LIST_FIRST(slp); 2966 if (*bo == NULL) 2967 return (0); 2968 vp = bo2vnode(*bo); 2969 if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0) 2970 return (1); 2971 /* 2972 * We use vhold in case the vnode does not 2973 * successfully sync. vhold prevents the vnode from 2974 * going away when we unlock the sync_mtx so that 2975 * we can acquire the vnode interlock. 2976 */ 2977 vholdl(vp); 2978 mtx_unlock(&sync_mtx); 2979 VI_UNLOCK(vp); 2980 if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { 2981 vdrop(vp); 2982 mtx_lock(&sync_mtx); 2983 return (*bo == LIST_FIRST(slp)); 2984 } 2985 MPASSERT(mp == NULL || (curthread->td_pflags & TDP_IGNSUSP) != 0 || 2986 (mp->mnt_kern_flag & MNTK_SUSPENDED) == 0, mp, 2987 ("suspended mp syncing vp %p", vp)); 2988 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 2989 (void) VOP_FSYNC(vp, MNT_LAZY, td); 2990 VOP_UNLOCK(vp); 2991 vn_finished_write(mp); 2992 BO_LOCK(*bo); 2993 if (((*bo)->bo_flag & BO_ONWORKLST) != 0) { 2994 /* 2995 * Put us back on the worklist. The worklist 2996 * routine will remove us from our current 2997 * position and then add us back in at a later 2998 * position. 2999 */ 3000 vn_syncer_add_to_worklist(*bo, syncdelay); 3001 } 3002 BO_UNLOCK(*bo); 3003 vdrop(vp); 3004 mtx_lock(&sync_mtx); 3005 return (0); 3006 } 3007 3008 static int first_printf = 1; 3009 3010 /* 3011 * System filesystem synchronizer daemon. 
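 *
 * Rough summary (added annotation): once per second the daemon advances
 * syncer_delayno to the next slot of syncer_workitem_pending[], lazily
 * fsyncs every vnode queued there and lets sync_vnode() reinsert
 * anything still on the worklist a full delay interval later.  During
 * shutdown it runs through the SYNCER_SHUTTING_DOWN and
 * SYNCER_FINAL_DELAY states to drain the wheel quickly.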
3012 */ 3013 static void 3014 sched_sync(void) 3015 { 3016 struct synclist *next, *slp; 3017 struct bufobj *bo; 3018 long starttime; 3019 struct thread *td = curthread; 3020 int last_work_seen; 3021 int net_worklist_len; 3022 int syncer_final_iter; 3023 int error; 3024 3025 last_work_seen = 0; 3026 syncer_final_iter = 0; 3027 syncer_state = SYNCER_RUNNING; 3028 starttime = time_uptime; 3029 td->td_pflags |= TDP_NORUNNINGBUF; 3030 3031 EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc, 3032 SHUTDOWN_PRI_LAST); 3033 3034 mtx_lock(&sync_mtx); 3035 for (;;) { 3036 if (syncer_state == SYNCER_FINAL_DELAY && 3037 syncer_final_iter == 0) { 3038 mtx_unlock(&sync_mtx); 3039 kproc_suspend_check(td->td_proc); 3040 mtx_lock(&sync_mtx); 3041 } 3042 net_worklist_len = syncer_worklist_len - sync_vnode_count; 3043 if (syncer_state != SYNCER_RUNNING && 3044 starttime != time_uptime) { 3045 if (first_printf) { 3046 printf("\nSyncing disks, vnodes remaining... "); 3047 first_printf = 0; 3048 } 3049 printf("%d ", net_worklist_len); 3050 } 3051 starttime = time_uptime; 3052 3053 /* 3054 * Push files whose dirty time has expired. Be careful 3055 * of interrupt race on slp queue. 3056 * 3057 * Skip over empty worklist slots when shutting down. 3058 */ 3059 do { 3060 slp = &syncer_workitem_pending[syncer_delayno]; 3061 syncer_delayno += 1; 3062 if (syncer_delayno == syncer_maxdelay) 3063 syncer_delayno = 0; 3064 next = &syncer_workitem_pending[syncer_delayno]; 3065 /* 3066 * If the worklist has wrapped since 3067 * it was emptied of all but syncer vnodes, 3068 * switch to the FINAL_DELAY state and run 3069 * for one more second. 3070 */ 3071 if (syncer_state == SYNCER_SHUTTING_DOWN && 3072 net_worklist_len == 0 && 3073 last_work_seen == syncer_delayno) { 3074 syncer_state = SYNCER_FINAL_DELAY; 3075 syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP; 3076 } 3077 } while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) && 3078 syncer_worklist_len > 0); 3079 3080 /* 3081 * Keep track of the last time there was anything 3082 * on the worklist other than syncer vnodes. 3083 * Return to the SHUTTING_DOWN state if any 3084 * new work appears. 3085 */ 3086 if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING) 3087 last_work_seen = syncer_delayno; 3088 if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY) 3089 syncer_state = SYNCER_SHUTTING_DOWN; 3090 while (!LIST_EMPTY(slp)) { 3091 error = sync_vnode(slp, &bo, td); 3092 if (error == 1) { 3093 LIST_REMOVE(bo, bo_synclist); 3094 LIST_INSERT_HEAD(next, bo, bo_synclist); 3095 continue; 3096 } 3097 3098 if (first_printf == 0) { 3099 /* 3100 * Drop the sync mutex, because some watchdog 3101 * drivers need to sleep while patting the watchdog. 3102 */ 3103 mtx_unlock(&sync_mtx); 3104 wdog_kern_pat(WD_LASTVAL); 3105 mtx_lock(&sync_mtx); 3106 } 3107 } 3108 if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0) 3109 syncer_final_iter--; 3110 /* 3111 * The variable rushjob allows the kernel to speed up the 3112 * processing of the filesystem syncer process. A rushjob 3113 * value of N tells the filesystem syncer to process the next 3114 * N seconds worth of work on its queue ASAP. Currently rushjob 3115 * is used by the soft update code to speed up the filesystem 3116 * syncer process when the incore state is getting so far 3117 * ahead of the disk that the kernel memory pool is being 3118 * threatened with exhaustion.
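 * For instance, three speedup_syncer() calls in a row leave rushjob at 3,
 * so the check below lets the loop drain three wheel slots back to back
 * before falling back to its usual one-slot-per-second pace.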
3119 */ 3120 if (rushjob > 0) { 3121 rushjob -= 1; 3122 continue; 3123 } 3124 /* 3125 * Just sleep for a short period of time between 3126 * iterations when shutting down to allow some I/O 3127 * to happen. 3128 * 3129 * If it has taken us less than a second to process the 3130 * current work, then wait. Otherwise start right over 3131 * again. We can still lose time if any single round 3132 * takes more than two seconds, but it does not really 3133 * matter as we are just trying to generally pace the 3134 * filesystem activity. 3135 */ 3136 if (syncer_state != SYNCER_RUNNING || 3137 time_uptime == starttime) { 3138 thread_lock(td); 3139 sched_prio(td, PPAUSE); 3140 thread_unlock(td); 3141 } 3142 if (syncer_state != SYNCER_RUNNING) 3143 cv_timedwait(&sync_wakeup, &sync_mtx, 3144 hz / SYNCER_SHUTDOWN_SPEEDUP); 3145 else if (time_uptime == starttime) 3146 cv_timedwait(&sync_wakeup, &sync_mtx, hz); 3147 } 3148 } 3149 3150 /* 3151 * Request the syncer daemon to speed up its work. 3152 * We never push it to speed up more than half of its 3153 * normal turn time, otherwise it could take over the cpu. 3154 */ 3155 int 3156 speedup_syncer(void) 3157 { 3158 int ret = 0; 3159 3160 mtx_lock(&sync_mtx); 3161 if (rushjob < syncdelay / 2) { 3162 rushjob += 1; 3163 stat_rush_requests += 1; 3164 ret = 1; 3165 } 3166 mtx_unlock(&sync_mtx); 3167 cv_broadcast(&sync_wakeup); 3168 return (ret); 3169 } 3170 3171 /* 3172 * Tell the syncer to speed up its work and run though its work 3173 * list several times, then tell it to shut down. 3174 */ 3175 static void 3176 syncer_shutdown(void *arg, int howto) 3177 { 3178 3179 if (howto & RB_NOSYNC) 3180 return; 3181 mtx_lock(&sync_mtx); 3182 syncer_state = SYNCER_SHUTTING_DOWN; 3183 rushjob = 0; 3184 mtx_unlock(&sync_mtx); 3185 cv_broadcast(&sync_wakeup); 3186 kproc_shutdown(arg, howto); 3187 } 3188 3189 void 3190 syncer_suspend(void) 3191 { 3192 3193 syncer_shutdown(updateproc, 0); 3194 } 3195 3196 void 3197 syncer_resume(void) 3198 { 3199 3200 mtx_lock(&sync_mtx); 3201 first_printf = 1; 3202 syncer_state = SYNCER_RUNNING; 3203 mtx_unlock(&sync_mtx); 3204 cv_broadcast(&sync_wakeup); 3205 kproc_resume(updateproc); 3206 } 3207 3208 /* 3209 * Move the buffer between the clean and dirty lists of its vnode. 3210 */ 3211 void 3212 reassignbuf(struct buf *bp) 3213 { 3214 struct vnode *vp; 3215 struct bufobj *bo; 3216 int delay; 3217 #ifdef INVARIANTS 3218 struct bufv *bv; 3219 #endif 3220 3221 vp = bp->b_vp; 3222 bo = bp->b_bufobj; 3223 3224 KASSERT((bp->b_flags & B_PAGING) == 0, 3225 ("%s: cannot reassign paging buffer %p", __func__, bp)); 3226 3227 CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X", 3228 bp, bp->b_vp, bp->b_flags); 3229 3230 BO_LOCK(bo); 3231 if ((bo->bo_flag & BO_NONSTERILE) == 0) { 3232 /* 3233 * Coordinate with getblk's unlocked lookup. Make 3234 * BO_NONSTERILE visible before the first reassignbuf produces 3235 * any side effect. This could be outside the bo lock if we 3236 * used a separate atomic flag field. 3237 */ 3238 bo->bo_flag |= BO_NONSTERILE; 3239 atomic_thread_fence_rel(); 3240 } 3241 buf_vlist_remove(bp); 3242 3243 /* 3244 * If dirty, put on list of dirty buffers; otherwise insert onto list 3245 * of clean buffers. 
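 * A dirty buffer also puts the bufobj on the syncer worklist, with the
 * delay class picked by vnode type below (dirdelay for directories,
 * metadelay for device vnodes, filedelay for the rest); directories and
 * metadata are conventionally given somewhat shorter delays.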
3246 */ 3247 if (bp->b_flags & B_DELWRI) { 3248 if ((bo->bo_flag & BO_ONWORKLST) == 0) { 3249 switch (vp->v_type) { 3250 case VDIR: 3251 delay = dirdelay; 3252 break; 3253 case VCHR: 3254 delay = metadelay; 3255 break; 3256 default: 3257 delay = filedelay; 3258 } 3259 vn_syncer_add_to_worklist(bo, delay); 3260 } 3261 buf_vlist_add(bp, bo, BX_VNDIRTY); 3262 } else { 3263 buf_vlist_add(bp, bo, BX_VNCLEAN); 3264 3265 if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { 3266 mtx_lock(&sync_mtx); 3267 LIST_REMOVE(bo, bo_synclist); 3268 syncer_worklist_len--; 3269 mtx_unlock(&sync_mtx); 3270 bo->bo_flag &= ~BO_ONWORKLST; 3271 } 3272 } 3273 #ifdef INVARIANTS 3274 bv = &bo->bo_clean; 3275 bp = TAILQ_FIRST(&bv->bv_hd); 3276 KASSERT(bp == NULL || bp->b_bufobj == bo, 3277 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 3278 bp = TAILQ_LAST(&bv->bv_hd, buflists); 3279 KASSERT(bp == NULL || bp->b_bufobj == bo, 3280 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 3281 bv = &bo->bo_dirty; 3282 bp = TAILQ_FIRST(&bv->bv_hd); 3283 KASSERT(bp == NULL || bp->b_bufobj == bo, 3284 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 3285 bp = TAILQ_LAST(&bv->bv_hd, buflists); 3286 KASSERT(bp == NULL || bp->b_bufobj == bo, 3287 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 3288 #endif 3289 BO_UNLOCK(bo); 3290 } 3291 3292 static void 3293 v_init_counters(struct vnode *vp) 3294 { 3295 3296 VNASSERT(vp->v_type == VNON && vp->v_data == NULL && vp->v_iflag == 0, 3297 vp, ("%s called for an initialized vnode", __FUNCTION__)); 3298 ASSERT_VI_UNLOCKED(vp, __FUNCTION__); 3299 3300 refcount_init(&vp->v_holdcnt, 1); 3301 refcount_init(&vp->v_usecount, 1); 3302 } 3303 3304 /* 3305 * Get a usecount on a vnode. 3306 * 3307 * vget and vget_finish may fail to lock the vnode if they lose a race against 3308 * it being doomed. LK_RETRY can be passed in flags to lock it anyway. 3309 * 3310 * Consumers which don't guarantee liveness of the vnode can use SMR to 3311 * try to get a reference. Note this operation can fail since the vnode 3312 * may be awaiting getting freed by the time they get to it. 
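 *
 * Illustrative consumer pattern (a sketch only, not lifted from any
 * particular caller; real users such as the name cache differ in the
 * details):
 *
 *	vfs_smr_enter();
 *	vp = <lockless lookup>;
 *	vs = vget_prep_smr(vp);
 *	vfs_smr_exit();
 *	if (vs == VGET_NONE)
 *		<restart the lookup>;
 *	error = vget_finish(vp, LK_SHARED, vs);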
3313 */ 3314 enum vgetstate 3315 vget_prep_smr(struct vnode *vp) 3316 { 3317 enum vgetstate vs; 3318 3319 VFS_SMR_ASSERT_ENTERED(); 3320 3321 if (refcount_acquire_if_not_zero(&vp->v_usecount)) { 3322 vs = VGET_USECOUNT; 3323 } else { 3324 if (vhold_smr(vp)) 3325 vs = VGET_HOLDCNT; 3326 else 3327 vs = VGET_NONE; 3328 } 3329 return (vs); 3330 } 3331 3332 enum vgetstate 3333 vget_prep(struct vnode *vp) 3334 { 3335 enum vgetstate vs; 3336 3337 if (refcount_acquire_if_not_zero(&vp->v_usecount)) { 3338 vs = VGET_USECOUNT; 3339 } else { 3340 vhold(vp); 3341 vs = VGET_HOLDCNT; 3342 } 3343 return (vs); 3344 } 3345 3346 void 3347 vget_abort(struct vnode *vp, enum vgetstate vs) 3348 { 3349 3350 switch (vs) { 3351 case VGET_USECOUNT: 3352 vrele(vp); 3353 break; 3354 case VGET_HOLDCNT: 3355 vdrop(vp); 3356 break; 3357 default: 3358 __assert_unreachable(); 3359 } 3360 } 3361 3362 int 3363 vget(struct vnode *vp, int flags) 3364 { 3365 enum vgetstate vs; 3366 3367 vs = vget_prep(vp); 3368 return (vget_finish(vp, flags, vs)); 3369 } 3370 3371 int 3372 vget_finish(struct vnode *vp, int flags, enum vgetstate vs) 3373 { 3374 int error; 3375 3376 if ((flags & LK_INTERLOCK) != 0) 3377 ASSERT_VI_LOCKED(vp, __func__); 3378 else 3379 ASSERT_VI_UNLOCKED(vp, __func__); 3380 VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp); 3381 VNPASS(vp->v_holdcnt > 0, vp); 3382 VNPASS(vs == VGET_HOLDCNT || vp->v_usecount > 0, vp); 3383 3384 error = vn_lock(vp, flags); 3385 if (__predict_false(error != 0)) { 3386 vget_abort(vp, vs); 3387 CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__, 3388 vp); 3389 return (error); 3390 } 3391 3392 vget_finish_ref(vp, vs); 3393 return (0); 3394 } 3395 3396 void 3397 vget_finish_ref(struct vnode *vp, enum vgetstate vs) 3398 { 3399 int old; 3400 3401 VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp); 3402 VNPASS(vp->v_holdcnt > 0, vp); 3403 VNPASS(vs == VGET_HOLDCNT || vp->v_usecount > 0, vp); 3404 3405 if (vs == VGET_USECOUNT) 3406 return; 3407 3408 /* 3409 * We hold the vnode. If the usecount is 0 it will be utilized to keep 3410 * the vnode around. Otherwise someone else lended their hold count and 3411 * we have to drop ours. 3412 */ 3413 old = atomic_fetchadd_int(&vp->v_usecount, 1); 3414 VNASSERT(old >= 0, vp, ("%s: wrong use count %d", __func__, old)); 3415 if (old != 0) { 3416 #ifdef INVARIANTS 3417 old = atomic_fetchadd_int(&vp->v_holdcnt, -1); 3418 VNASSERT(old > 1, vp, ("%s: wrong hold count %d", __func__, old)); 3419 #else 3420 refcount_release(&vp->v_holdcnt); 3421 #endif 3422 } 3423 } 3424 3425 void 3426 vref(struct vnode *vp) 3427 { 3428 enum vgetstate vs; 3429 3430 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3431 vs = vget_prep(vp); 3432 vget_finish_ref(vp, vs); 3433 } 3434 3435 void 3436 vrefact(struct vnode *vp) 3437 { 3438 int old __diagused; 3439 3440 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3441 old = refcount_acquire(&vp->v_usecount); 3442 VNASSERT(old > 0, vp, ("%s: wrong use count %d", __func__, old)); 3443 } 3444 3445 void 3446 vlazy(struct vnode *vp) 3447 { 3448 struct mount *mp; 3449 3450 VNASSERT(vp->v_holdcnt > 0, vp, ("%s: vnode not held", __func__)); 3451 3452 if ((vp->v_mflag & VMP_LAZYLIST) != 0) 3453 return; 3454 /* 3455 * We may get here for inactive routines after the vnode got doomed. 
3456 */ 3457 if (VN_IS_DOOMED(vp)) 3458 return; 3459 mp = vp->v_mount; 3460 mtx_lock(&mp->mnt_listmtx); 3461 if ((vp->v_mflag & VMP_LAZYLIST) == 0) { 3462 vp->v_mflag |= VMP_LAZYLIST; 3463 TAILQ_INSERT_TAIL(&mp->mnt_lazyvnodelist, vp, v_lazylist); 3464 mp->mnt_lazyvnodelistsize++; 3465 } 3466 mtx_unlock(&mp->mnt_listmtx); 3467 } 3468 3469 static void 3470 vunlazy(struct vnode *vp) 3471 { 3472 struct mount *mp; 3473 3474 ASSERT_VI_LOCKED(vp, __func__); 3475 VNPASS(!VN_IS_DOOMED(vp), vp); 3476 3477 mp = vp->v_mount; 3478 mtx_lock(&mp->mnt_listmtx); 3479 VNPASS(vp->v_mflag & VMP_LAZYLIST, vp); 3480 /* 3481 * Don't remove the vnode from the lazy list if another thread 3482 * has increased the hold count. It may have re-enqueued the 3483 * vnode to the lazy list and is now responsible for its 3484 * removal. 3485 */ 3486 if (vp->v_holdcnt == 0) { 3487 vp->v_mflag &= ~VMP_LAZYLIST; 3488 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, vp, v_lazylist); 3489 mp->mnt_lazyvnodelistsize--; 3490 } 3491 mtx_unlock(&mp->mnt_listmtx); 3492 } 3493 3494 /* 3495 * This routine is only meant to be called from vgonel prior to dooming 3496 * the vnode. 3497 */ 3498 static void 3499 vunlazy_gone(struct vnode *vp) 3500 { 3501 struct mount *mp; 3502 3503 ASSERT_VOP_ELOCKED(vp, __func__); 3504 ASSERT_VI_LOCKED(vp, __func__); 3505 VNPASS(!VN_IS_DOOMED(vp), vp); 3506 3507 if (vp->v_mflag & VMP_LAZYLIST) { 3508 mp = vp->v_mount; 3509 mtx_lock(&mp->mnt_listmtx); 3510 VNPASS(vp->v_mflag & VMP_LAZYLIST, vp); 3511 vp->v_mflag &= ~VMP_LAZYLIST; 3512 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, vp, v_lazylist); 3513 mp->mnt_lazyvnodelistsize--; 3514 mtx_unlock(&mp->mnt_listmtx); 3515 } 3516 } 3517 3518 static void 3519 vdefer_inactive(struct vnode *vp) 3520 { 3521 3522 ASSERT_VI_LOCKED(vp, __func__); 3523 VNPASS(vp->v_holdcnt > 0, vp); 3524 if (VN_IS_DOOMED(vp)) { 3525 vdropl(vp); 3526 return; 3527 } 3528 if (vp->v_iflag & VI_DEFINACT) { 3529 VNPASS(vp->v_holdcnt > 1, vp); 3530 vdropl(vp); 3531 return; 3532 } 3533 if (vp->v_usecount > 0) { 3534 vp->v_iflag &= ~VI_OWEINACT; 3535 vdropl(vp); 3536 return; 3537 } 3538 vlazy(vp); 3539 vp->v_iflag |= VI_DEFINACT; 3540 VI_UNLOCK(vp); 3541 atomic_add_long(&deferred_inact, 1); 3542 } 3543 3544 static void 3545 vdefer_inactive_unlocked(struct vnode *vp) 3546 { 3547 3548 VI_LOCK(vp); 3549 if ((vp->v_iflag & VI_OWEINACT) == 0) { 3550 vdropl(vp); 3551 return; 3552 } 3553 vdefer_inactive(vp); 3554 } 3555 3556 enum vput_op { VRELE, VPUT, VUNREF }; 3557 3558 /* 3559 * Handle ->v_usecount transitioning to 0. 3560 * 3561 * By releasing the last usecount we take ownership of the hold count which 3562 * provides liveness of the vnode, meaning we have to vdrop. 3563 * 3564 * For all vnodes we may need to perform inactive processing. It requires an 3565 * exclusive lock on the vnode, while it is legal to call here with only a 3566 * shared lock (or no locks). If locking the vnode in an expected manner fails, 3567 * inactive processing gets deferred to the syncer. 3568 * 3569 * XXX Some filesystems pass in an exclusively locked vnode and strongly depend 3570 * on the lock being held all the way until VOP_INACTIVE. This in particular 3571 * happens with UFS which adds half-constructed vnodes to the hash, where they 3572 * can be found by other code. 
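 *
 * In short (summary added here): VRELE may enter with the vnode lock in
 * any state and acquires/upgrades it as needed, VPUT enters locked and
 * always exits unlocked, VUNREF enters and exits locked.  If the
 * exclusive lock cannot be obtained safely, inactive processing is
 * deferred to the syncer via vdefer_inactive().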
3573 */ 3574 static void 3575 vput_final(struct vnode *vp, enum vput_op func) 3576 { 3577 int error; 3578 bool want_unlock; 3579 3580 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3581 VNPASS(vp->v_holdcnt > 0, vp); 3582 3583 VI_LOCK(vp); 3584 3585 /* 3586 * By the time we got here someone else might have transitioned 3587 * the count back to > 0. 3588 */ 3589 if (vp->v_usecount > 0) 3590 goto out; 3591 3592 /* 3593 * If the vnode is doomed vgone already performed inactive processing 3594 * (if needed). 3595 */ 3596 if (VN_IS_DOOMED(vp)) 3597 goto out; 3598 3599 if (__predict_true(VOP_NEED_INACTIVE(vp) == 0)) 3600 goto out; 3601 3602 if (vp->v_iflag & VI_DOINGINACT) 3603 goto out; 3604 3605 /* 3606 * Locking operations here will drop the interlock and possibly the 3607 * vnode lock, opening a window where the vnode can get doomed all the 3608 * while ->v_usecount is 0. Set VI_OWEINACT to let vgone know to 3609 * perform inactive. 3610 */ 3611 vp->v_iflag |= VI_OWEINACT; 3612 want_unlock = false; 3613 error = 0; 3614 switch (func) { 3615 case VRELE: 3616 switch (VOP_ISLOCKED(vp)) { 3617 case LK_EXCLUSIVE: 3618 break; 3619 case LK_EXCLOTHER: 3620 case 0: 3621 want_unlock = true; 3622 error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK); 3623 VI_LOCK(vp); 3624 break; 3625 default: 3626 /* 3627 * The lock has at least one sharer, but we have no way 3628 * to conclude whether this is us. Play it safe and 3629 * defer processing. 3630 */ 3631 error = EAGAIN; 3632 break; 3633 } 3634 break; 3635 case VPUT: 3636 want_unlock = true; 3637 if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { 3638 error = VOP_LOCK(vp, LK_UPGRADE | LK_INTERLOCK | 3639 LK_NOWAIT); 3640 VI_LOCK(vp); 3641 } 3642 break; 3643 case VUNREF: 3644 if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { 3645 error = VOP_LOCK(vp, LK_TRYUPGRADE | LK_INTERLOCK); 3646 VI_LOCK(vp); 3647 } 3648 break; 3649 } 3650 if (error == 0) { 3651 if (func == VUNREF) { 3652 VNASSERT((vp->v_vflag & VV_UNREF) == 0, vp, 3653 ("recursive vunref")); 3654 vp->v_vflag |= VV_UNREF; 3655 } 3656 for (;;) { 3657 error = vinactive(vp); 3658 if (want_unlock) 3659 VOP_UNLOCK(vp); 3660 if (error != ERELOOKUP || !want_unlock) 3661 break; 3662 VOP_LOCK(vp, LK_EXCLUSIVE); 3663 } 3664 if (func == VUNREF) 3665 vp->v_vflag &= ~VV_UNREF; 3666 vdropl(vp); 3667 } else { 3668 vdefer_inactive(vp); 3669 } 3670 return; 3671 out: 3672 if (func == VPUT) 3673 VOP_UNLOCK(vp); 3674 vdropl(vp); 3675 } 3676 3677 /* 3678 * Decrement ->v_usecount for a vnode. 3679 * 3680 * Releasing the last use count requires additional processing, see vput_final 3681 * above for details. 3682 * 3683 * Comment above each variant denotes lock state on entry and exit. 
3684 */ 3685 3686 /* 3687 * in: any 3688 * out: same as passed in 3689 */ 3690 void 3691 vrele(struct vnode *vp) 3692 { 3693 3694 ASSERT_VI_UNLOCKED(vp, __func__); 3695 if (!refcount_release(&vp->v_usecount)) 3696 return; 3697 vput_final(vp, VRELE); 3698 } 3699 3700 /* 3701 * in: locked 3702 * out: unlocked 3703 */ 3704 void 3705 vput(struct vnode *vp) 3706 { 3707 3708 ASSERT_VOP_LOCKED(vp, __func__); 3709 ASSERT_VI_UNLOCKED(vp, __func__); 3710 if (!refcount_release(&vp->v_usecount)) { 3711 VOP_UNLOCK(vp); 3712 return; 3713 } 3714 vput_final(vp, VPUT); 3715 } 3716 3717 /* 3718 * in: locked 3719 * out: locked 3720 */ 3721 void 3722 vunref(struct vnode *vp) 3723 { 3724 3725 ASSERT_VOP_LOCKED(vp, __func__); 3726 ASSERT_VI_UNLOCKED(vp, __func__); 3727 if (!refcount_release(&vp->v_usecount)) 3728 return; 3729 vput_final(vp, VUNREF); 3730 } 3731 3732 void 3733 vhold(struct vnode *vp) 3734 { 3735 int old; 3736 3737 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3738 old = atomic_fetchadd_int(&vp->v_holdcnt, 1); 3739 VNASSERT(old >= 0 && (old & VHOLD_ALL_FLAGS) == 0, vp, 3740 ("%s: wrong hold count %d", __func__, old)); 3741 if (old == 0) 3742 vfs_freevnodes_dec(); 3743 } 3744 3745 void 3746 vholdnz(struct vnode *vp) 3747 { 3748 3749 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3750 #ifdef INVARIANTS 3751 int old = atomic_fetchadd_int(&vp->v_holdcnt, 1); 3752 VNASSERT(old > 0 && (old & VHOLD_ALL_FLAGS) == 0, vp, 3753 ("%s: wrong hold count %d", __func__, old)); 3754 #else 3755 atomic_add_int(&vp->v_holdcnt, 1); 3756 #endif 3757 } 3758 3759 /* 3760 * Grab a hold count unless the vnode is freed. 3761 * 3762 * Only use this routine if vfs smr is the only protection you have against 3763 * freeing the vnode. 3764 * 3765 * The code loops trying to add a hold count as long as the VHOLD_NO_SMR flag 3766 * is not set. After the flag is set the vnode becomes immutable to anyone but 3767 * the thread which managed to set the flag. 3768 * 3769 * It may be tempting to replace the loop with: 3770 * count = atomic_fetchadd_int(&vp->v_holdcnt, 1); 3771 * if (count & VHOLD_NO_SMR) { 3772 * backpedal and error out; 3773 * } 3774 * 3775 * However, while this is more performant, it hinders debugging by eliminating 3776 * the previously mentioned invariant. 3777 */ 3778 bool 3779 vhold_smr(struct vnode *vp) 3780 { 3781 int count; 3782 3783 VFS_SMR_ASSERT_ENTERED(); 3784 3785 count = atomic_load_int(&vp->v_holdcnt); 3786 for (;;) { 3787 if (count & VHOLD_NO_SMR) { 3788 VNASSERT((count & ~VHOLD_NO_SMR) == 0, vp, 3789 ("non-zero hold count with flags %d\n", count)); 3790 return (false); 3791 } 3792 VNASSERT(count >= 0, vp, ("invalid hold count %d\n", count)); 3793 if (atomic_fcmpset_int(&vp->v_holdcnt, &count, count + 1)) { 3794 if (count == 0) 3795 vfs_freevnodes_dec(); 3796 return (true); 3797 } 3798 } 3799 } 3800 3801 /* 3802 * Hold a free vnode for recycling. 3803 * 3804 * Note: vnode_init references this comment. 3805 * 3806 * Attempts to recycle only need the global vnode list lock and have no use for 3807 * SMR. 3808 * 3809 * However, vnodes get inserted into the global list before they get fully 3810 * initialized and stay there until UMA decides to free the memory. This in 3811 * particular means the target can be found before it becomes usable and after 3812 * it becomes recycled. Picking up such vnodes is guarded with v_holdcnt set to 3813 * VHOLD_NO_SMR. 3814 * 3815 * Note: the vnode may gain more references after we transition the count 0->1. 
3816 */ 3817 static bool 3818 vhold_recycle_free(struct vnode *vp) 3819 { 3820 int count; 3821 3822 mtx_assert(&vnode_list_mtx, MA_OWNED); 3823 3824 count = atomic_load_int(&vp->v_holdcnt); 3825 for (;;) { 3826 if (count & VHOLD_NO_SMR) { 3827 VNASSERT((count & ~VHOLD_NO_SMR) == 0, vp, 3828 ("non-zero hold count with flags %d\n", count)); 3829 return (false); 3830 } 3831 VNASSERT(count >= 0, vp, ("invalid hold count %d\n", count)); 3832 if (count > 0) { 3833 return (false); 3834 } 3835 if (atomic_fcmpset_int(&vp->v_holdcnt, &count, count + 1)) { 3836 vfs_freevnodes_dec(); 3837 return (true); 3838 } 3839 } 3840 } 3841 3842 static void __noinline 3843 vdbatch_process(struct vdbatch *vd) 3844 { 3845 struct vnode *vp; 3846 int i; 3847 3848 mtx_assert(&vd->lock, MA_OWNED); 3849 MPASS(curthread->td_pinned > 0); 3850 MPASS(vd->index == VDBATCH_SIZE); 3851 3852 /* 3853 * Attempt to requeue the passed batch, but give up easily. 3854 * 3855 * Despite batching the mechanism is prone to transient *significant* 3856 * lock contention, where vnode_list_mtx becomes the primary bottleneck 3857 * if multiple CPUs get here (one real-world example is highly parallel 3858 * do-nothing make, which will stat *tons* of vnodes). Since it is 3859 * quasi-LRU (read: not that great even if fully honoured) provide an 3860 * option to just dodge the problem. Parties which don't like it are 3861 * welcome to implement something better. 3862 */ 3863 if (vnode_can_skip_requeue) { 3864 if (!mtx_trylock(&vnode_list_mtx)) { 3865 counter_u64_add(vnode_skipped_requeues, 1); 3866 critical_enter(); 3867 for (i = 0; i < VDBATCH_SIZE; i++) { 3868 vp = vd->tab[i]; 3869 vd->tab[i] = NULL; 3870 MPASS(vp->v_dbatchcpu != NOCPU); 3871 vp->v_dbatchcpu = NOCPU; 3872 } 3873 vd->index = 0; 3874 critical_exit(); 3875 return; 3876 3877 } 3878 /* fallthrough to locked processing */ 3879 } else { 3880 mtx_lock(&vnode_list_mtx); 3881 } 3882 3883 mtx_assert(&vnode_list_mtx, MA_OWNED); 3884 critical_enter(); 3885 for (i = 0; i < VDBATCH_SIZE; i++) { 3886 vp = vd->tab[i]; 3887 vd->tab[i] = NULL; 3888 TAILQ_REMOVE(&vnode_list, vp, v_vnodelist); 3889 TAILQ_INSERT_TAIL(&vnode_list, vp, v_vnodelist); 3890 MPASS(vp->v_dbatchcpu != NOCPU); 3891 vp->v_dbatchcpu = NOCPU; 3892 } 3893 mtx_unlock(&vnode_list_mtx); 3894 vd->index = 0; 3895 critical_exit(); 3896 } 3897 3898 static void 3899 vdbatch_enqueue(struct vnode *vp) 3900 { 3901 struct vdbatch *vd; 3902 3903 ASSERT_VI_LOCKED(vp, __func__); 3904 VNPASS(!VN_IS_DOOMED(vp), vp); 3905 3906 if (vp->v_dbatchcpu != NOCPU) { 3907 VI_UNLOCK(vp); 3908 return; 3909 } 3910 3911 sched_pin(); 3912 vd = DPCPU_PTR(vd); 3913 mtx_lock(&vd->lock); 3914 MPASS(vd->index < VDBATCH_SIZE); 3915 MPASS(vd->tab[vd->index] == NULL); 3916 /* 3917 * A hack: we depend on being pinned so that we know what to put in 3918 * ->v_dbatchcpu. 3919 */ 3920 vp->v_dbatchcpu = curcpu; 3921 vd->tab[vd->index] = vp; 3922 vd->index++; 3923 VI_UNLOCK(vp); 3924 if (vd->index == VDBATCH_SIZE) 3925 vdbatch_process(vd); 3926 mtx_unlock(&vd->lock); 3927 sched_unpin(); 3928 } 3929 3930 /* 3931 * This routine must only be called for vnodes which are about to be 3932 * deallocated. Supporting dequeue for arbitrary vnodes would require 3933 * validating that the locked batch matches.
3934 */ 3935 static void 3936 vdbatch_dequeue(struct vnode *vp) 3937 { 3938 struct vdbatch *vd; 3939 int i; 3940 short cpu; 3941 3942 VNPASS(vp->v_type == VBAD || vp->v_type == VNON, vp); 3943 3944 cpu = vp->v_dbatchcpu; 3945 if (cpu == NOCPU) 3946 return; 3947 3948 vd = DPCPU_ID_PTR(cpu, vd); 3949 mtx_lock(&vd->lock); 3950 for (i = 0; i < vd->index; i++) { 3951 if (vd->tab[i] != vp) 3952 continue; 3953 vp->v_dbatchcpu = NOCPU; 3954 vd->index--; 3955 vd->tab[i] = vd->tab[vd->index]; 3956 vd->tab[vd->index] = NULL; 3957 break; 3958 } 3959 mtx_unlock(&vd->lock); 3960 /* 3961 * Either we dequeued the vnode above or the target CPU beat us to it. 3962 */ 3963 MPASS(vp->v_dbatchcpu == NOCPU); 3964 } 3965 3966 /* 3967 * Drop the hold count of the vnode. 3968 * 3969 * It will only get freed if this is the last hold *and* it has been vgone'd. 3970 * 3971 * Because the vnode vm object keeps a hold reference on the vnode if 3972 * there is at least one resident non-cached page, the vnode cannot 3973 * leave the active list without the page cleanup done. 3974 */ 3975 static void __noinline 3976 vdropl_final(struct vnode *vp) 3977 { 3978 3979 ASSERT_VI_LOCKED(vp, __func__); 3980 VNPASS(VN_IS_DOOMED(vp), vp); 3981 /* 3982 * Set the VHOLD_NO_SMR flag. 3983 * 3984 * We may be racing against vhold_smr. If they win we can just pretend 3985 * we never got this far, they will vdrop later. 3986 */ 3987 if (__predict_false(!atomic_cmpset_int(&vp->v_holdcnt, 0, VHOLD_NO_SMR))) { 3988 vfs_freevnodes_inc(); 3989 VI_UNLOCK(vp); 3990 /* 3991 * We lost the aforementioned race. Any subsequent access is 3992 * invalid as they might have managed to vdropl on their own. 3993 */ 3994 return; 3995 } 3996 /* 3997 * Don't bump freevnodes as this one is going away. 3998 */ 3999 freevnode(vp); 4000 } 4001 4002 void 4003 vdrop(struct vnode *vp) 4004 { 4005 4006 ASSERT_VI_UNLOCKED(vp, __func__); 4007 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 4008 if (refcount_release_if_not_last(&vp->v_holdcnt)) 4009 return; 4010 VI_LOCK(vp); 4011 vdropl(vp); 4012 } 4013 4014 static __always_inline void 4015 vdropl_impl(struct vnode *vp, bool enqueue) 4016 { 4017 4018 ASSERT_VI_LOCKED(vp, __func__); 4019 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 4020 if (!refcount_release(&vp->v_holdcnt)) { 4021 VI_UNLOCK(vp); 4022 return; 4023 } 4024 VNPASS((vp->v_iflag & VI_OWEINACT) == 0, vp); 4025 VNPASS((vp->v_iflag & VI_DEFINACT) == 0, vp); 4026 if (VN_IS_DOOMED(vp)) { 4027 vdropl_final(vp); 4028 return; 4029 } 4030 4031 vfs_freevnodes_inc(); 4032 if (vp->v_mflag & VMP_LAZYLIST) { 4033 vunlazy(vp); 4034 } 4035 4036 if (!enqueue) { 4037 VI_UNLOCK(vp); 4038 return; 4039 } 4040 4041 /* 4042 * Also unlocks the interlock. We can't assert on it as we 4043 * released our hold and by now the vnode might have been 4044 * freed. 4045 */ 4046 vdbatch_enqueue(vp); 4047 } 4048 4049 void 4050 vdropl(struct vnode *vp) 4051 { 4052 4053 vdropl_impl(vp, true); 4054 } 4055 4056 /* 4057 * vdrop a vnode when recycling 4058 * 4059 * This is a special case routine only to be used when recycling; it differs from 4060 * regular vdrop by not requeueing the vnode on the LRU. 4061 * 4062 * Consider a case where vtryrecycle continuously fails with all vnodes (due to 4063 * e.g., frozen writes on the filesystem), filling the batch and causing it to 4064 * be requeued. Then vnlru will end up revisiting the same vnodes. This is a 4065 * loop which can last for as long as writes are frozen.
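 *
 * A recycling-path caller is therefore expected to release its hold through
 * this variant rather than the regular one, along the lines of (sketch only):
 *
 *	VI_LOCK(vp);
 *	...
 *	vdropl_recycle(vp);	(instead of vdropl(vp))
 *
 * so that a failed attempt does not push the vnode back through the per-CPU
 * batch and onto the tail of the global vnode list.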
4066 */ 4067 static void 4068 vdropl_recycle(struct vnode *vp) 4069 { 4070 4071 vdropl_impl(vp, false); 4072 } 4073 4074 static void 4075 vdrop_recycle(struct vnode *vp) 4076 { 4077 4078 VI_LOCK(vp); 4079 vdropl_recycle(vp); 4080 } 4081 4082 /* 4083 * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT 4084 * flags. DOINGINACT prevents us from recursing in calls to vinactive. 4085 */ 4086 static int 4087 vinactivef(struct vnode *vp) 4088 { 4089 int error; 4090 4091 ASSERT_VOP_ELOCKED(vp, "vinactive"); 4092 ASSERT_VI_LOCKED(vp, "vinactive"); 4093 VNPASS((vp->v_iflag & VI_DOINGINACT) == 0, vp); 4094 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 4095 vp->v_iflag |= VI_DOINGINACT; 4096 vp->v_iflag &= ~VI_OWEINACT; 4097 VI_UNLOCK(vp); 4098 4099 /* 4100 * Before moving off the active list, we must be sure that any 4101 * modified pages are converted into the vnode's dirty 4102 * buffers, since these will no longer be checked once the 4103 * vnode is on the inactive list. 4104 * 4105 * The write-out of the dirty pages is asynchronous. At the 4106 * point that VOP_INACTIVE() is called, there could still be 4107 * pending I/O and dirty pages in the object. 4108 */ 4109 if ((vp->v_vflag & VV_NOSYNC) == 0) 4110 vnode_pager_clean_async(vp); 4111 4112 error = VOP_INACTIVE(vp); 4113 VI_LOCK(vp); 4114 VNPASS(vp->v_iflag & VI_DOINGINACT, vp); 4115 vp->v_iflag &= ~VI_DOINGINACT; 4116 return (error); 4117 } 4118 4119 int 4120 vinactive(struct vnode *vp) 4121 { 4122 4123 ASSERT_VOP_ELOCKED(vp, "vinactive"); 4124 ASSERT_VI_LOCKED(vp, "vinactive"); 4125 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 4126 4127 if ((vp->v_iflag & VI_OWEINACT) == 0) 4128 return (0); 4129 if (vp->v_iflag & VI_DOINGINACT) 4130 return (0); 4131 if (vp->v_usecount > 0) { 4132 vp->v_iflag &= ~VI_OWEINACT; 4133 return (0); 4134 } 4135 return (vinactivef(vp)); 4136 } 4137 4138 /* 4139 * Remove any vnodes in the vnode table belonging to mount point mp. 4140 * 4141 * If FORCECLOSE is not specified, there should not be any active ones, 4142 * return error if any are found (nb: this is a user error, not a 4143 * system error). If FORCECLOSE is specified, detach any active vnodes 4144 * that are found. 4145 * 4146 * If WRITECLOSE is set, only flush out regular file vnodes open for 4147 * writing. 4148 * 4149 * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped. 4150 * 4151 * `rootrefs' specifies the base reference count for the root vnode 4152 * of this filesystem. The root vnode is considered busy if its 4153 * v_usecount exceeds this value. On a successful return, vflush(, td) 4154 * will call vrele() on the root vnode exactly rootrefs times. 4155 * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must 4156 * be zero. 4157 */ 4158 #ifdef DIAGNOSTIC 4159 static int busyprt = 0; /* print out busy vnodes */ 4160 SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "Print out busy vnodes"); 4161 #endif 4162 4163 int 4164 vflush(struct mount *mp, int rootrefs, int flags, struct thread *td) 4165 { 4166 struct vnode *vp, *mvp, *rootvp = NULL; 4167 struct vattr vattr; 4168 int busy = 0, error; 4169 4170 CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp, 4171 rootrefs, flags); 4172 if (rootrefs > 0) { 4173 KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0, 4174 ("vflush: bad args")); 4175 /* 4176 * Get the filesystem root vnode. We can vput() it 4177 * immediately, since with rootrefs > 0, it won't go away. 
4178 */ 4179 if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) { 4180 CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d", 4181 __func__, error); 4182 return (error); 4183 } 4184 vput(rootvp); 4185 } 4186 loop: 4187 MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { 4188 vholdl(vp); 4189 error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE); 4190 if (error) { 4191 vdrop(vp); 4192 MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); 4193 goto loop; 4194 } 4195 /* 4196 * Skip over vnodes marked VV_SYSTEM. 4197 */ 4198 if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) { 4199 VOP_UNLOCK(vp); 4200 vdrop(vp); 4201 continue; 4202 } 4203 /* 4204 * If WRITECLOSE is set, flush out unlinked but still open 4205 * files (even if open only for reading) and regular file 4206 * vnodes open for writing. 4207 */ 4208 if (flags & WRITECLOSE) { 4209 vnode_pager_clean_async(vp); 4210 do { 4211 error = VOP_FSYNC(vp, MNT_WAIT, td); 4212 } while (error == ERELOOKUP); 4213 if (error != 0) { 4214 VOP_UNLOCK(vp); 4215 vdrop(vp); 4216 MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); 4217 return (error); 4218 } 4219 error = VOP_GETATTR(vp, &vattr, td->td_ucred); 4220 VI_LOCK(vp); 4221 4222 if ((vp->v_type == VNON || 4223 (error == 0 && vattr.va_nlink > 0)) && 4224 (vp->v_writecount <= 0 || vp->v_type != VREG)) { 4225 VOP_UNLOCK(vp); 4226 vdropl(vp); 4227 continue; 4228 } 4229 } else 4230 VI_LOCK(vp); 4231 /* 4232 * With v_usecount == 0, all we need to do is clear out the 4233 * vnode data structures and we are done. 4234 * 4235 * If FORCECLOSE is set, forcibly close the vnode. 4236 */ 4237 if (vp->v_usecount == 0 || (flags & FORCECLOSE)) { 4238 vgonel(vp); 4239 } else { 4240 busy++; 4241 #ifdef DIAGNOSTIC 4242 if (busyprt) 4243 vn_printf(vp, "vflush: busy vnode "); 4244 #endif 4245 } 4246 VOP_UNLOCK(vp); 4247 vdropl(vp); 4248 } 4249 if (rootrefs > 0 && (flags & FORCECLOSE) == 0) { 4250 /* 4251 * If just the root vnode is busy, and if its refcount 4252 * is equal to `rootrefs', then go ahead and kill it. 4253 */ 4254 VI_LOCK(rootvp); 4255 KASSERT(busy > 0, ("vflush: not busy")); 4256 VNASSERT(rootvp->v_usecount >= rootrefs, rootvp, 4257 ("vflush: usecount %d < rootrefs %d", 4258 rootvp->v_usecount, rootrefs)); 4259 if (busy == 1 && rootvp->v_usecount == rootrefs) { 4260 VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK); 4261 vgone(rootvp); 4262 VOP_UNLOCK(rootvp); 4263 busy = 0; 4264 } else 4265 VI_UNLOCK(rootvp); 4266 } 4267 if (busy) { 4268 CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__, 4269 busy); 4270 return (EBUSY); 4271 } 4272 for (; rootrefs > 0; rootrefs--) 4273 vrele(rootvp); 4274 return (0); 4275 } 4276 4277 /* 4278 * Recycle an unused vnode. 4279 */ 4280 int 4281 vrecycle(struct vnode *vp) 4282 { 4283 int recycled; 4284 4285 VI_LOCK(vp); 4286 recycled = vrecyclel(vp); 4287 VI_UNLOCK(vp); 4288 return (recycled); 4289 } 4290 4291 /* 4292 * vrecycle, with the vp interlock held. 4293 */ 4294 int 4295 vrecyclel(struct vnode *vp) 4296 { 4297 int recycled; 4298 4299 ASSERT_VOP_ELOCKED(vp, __func__); 4300 ASSERT_VI_LOCKED(vp, __func__); 4301 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 4302 recycled = 0; 4303 if (vp->v_usecount == 0) { 4304 recycled = 1; 4305 vgonel(vp); 4306 } 4307 return (recycled); 4308 } 4309 4310 /* 4311 * Eliminate all activity associated with a vnode 4312 * in preparation for reuse. 4313 */ 4314 void 4315 vgone(struct vnode *vp) 4316 { 4317 VI_LOCK(vp); 4318 vgonel(vp); 4319 VI_UNLOCK(vp); 4320 } 4321 4322 /* 4323 * Notify upper mounts about reclaimed or unlinked vnode.
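 *
 * For example, reclamation reports itself to stacked filesystems (e.g. nullfs
 * mounts sitting on top of this vnode) from vgonel() below via:
 *
 *	vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM);
 *
 * which ends up invoking VFS_RECLAIM_LOWERVP() on each registered upper mount.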
4324 */ 4325 void 4326 vfs_notify_upper(struct vnode *vp, enum vfs_notify_upper_type event) 4327 { 4328 struct mount *mp; 4329 struct mount_upper_node *ump; 4330 4331 mp = atomic_load_ptr(&vp->v_mount); 4332 if (mp == NULL) 4333 return; 4334 if (TAILQ_EMPTY(&mp->mnt_notify)) 4335 return; 4336 4337 MNT_ILOCK(mp); 4338 mp->mnt_upper_pending++; 4339 KASSERT(mp->mnt_upper_pending > 0, 4340 ("%s: mnt_upper_pending %d", __func__, mp->mnt_upper_pending)); 4341 TAILQ_FOREACH(ump, &mp->mnt_notify, mnt_upper_link) { 4342 MNT_IUNLOCK(mp); 4343 switch (event) { 4344 case VFS_NOTIFY_UPPER_RECLAIM: 4345 VFS_RECLAIM_LOWERVP(ump->mp, vp); 4346 break; 4347 case VFS_NOTIFY_UPPER_UNLINK: 4348 VFS_UNLINK_LOWERVP(ump->mp, vp); 4349 break; 4350 } 4351 MNT_ILOCK(mp); 4352 } 4353 mp->mnt_upper_pending--; 4354 if ((mp->mnt_kern_flag & MNTK_UPPER_WAITER) != 0 && 4355 mp->mnt_upper_pending == 0) { 4356 mp->mnt_kern_flag &= ~MNTK_UPPER_WAITER; 4357 wakeup(&mp->mnt_uppers); 4358 } 4359 MNT_IUNLOCK(mp); 4360 } 4361 4362 /* 4363 * vgone, with the vp interlock held. 4364 */ 4365 static void 4366 vgonel(struct vnode *vp) 4367 { 4368 struct thread *td; 4369 struct mount *mp; 4370 vm_object_t object; 4371 bool active, doinginact, oweinact; 4372 4373 ASSERT_VOP_ELOCKED(vp, "vgonel"); 4374 ASSERT_VI_LOCKED(vp, "vgonel"); 4375 VNASSERT(vp->v_holdcnt, vp, 4376 ("vgonel: vp %p has no reference.", vp)); 4377 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 4378 td = curthread; 4379 4380 /* 4381 * Don't vgonel if we're already doomed. 4382 */ 4383 if (VN_IS_DOOMED(vp)) { 4384 VNPASS(vn_get_state(vp) == VSTATE_DESTROYING || \ 4385 vn_get_state(vp) == VSTATE_DEAD, vp); 4386 return; 4387 } 4388 /* 4389 * Paired with freevnode. 4390 */ 4391 vn_seqc_write_begin_locked(vp); 4392 vunlazy_gone(vp); 4393 vn_irflag_set_locked(vp, VIRF_DOOMED); 4394 vn_set_state(vp, VSTATE_DESTROYING); 4395 4396 /* 4397 * Check to see if the vnode is in use. If so, we have to 4398 * call VOP_CLOSE() and VOP_INACTIVE(). 4399 * 4400 * It could be that VOP_INACTIVE() requested reclamation, in 4401 * which case we should avoid recursion, so check 4402 * VI_DOINGINACT. This is not precise but good enough. 4403 */ 4404 active = vp->v_usecount > 0; 4405 oweinact = (vp->v_iflag & VI_OWEINACT) != 0; 4406 doinginact = (vp->v_iflag & VI_DOINGINACT) != 0; 4407 4408 /* 4409 * If we need to do inactive VI_OWEINACT will be set. 4410 */ 4411 if (vp->v_iflag & VI_DEFINACT) { 4412 VNASSERT(vp->v_holdcnt > 1, vp, ("lost hold count")); 4413 vp->v_iflag &= ~VI_DEFINACT; 4414 vdropl(vp); 4415 } else { 4416 VNASSERT(vp->v_holdcnt > 0, vp, ("vnode without hold count")); 4417 VI_UNLOCK(vp); 4418 } 4419 cache_purge_vgone(vp); 4420 vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM); 4421 4422 /* 4423 * If purging an active vnode, it must be closed and 4424 * deactivated before being reclaimed. 4425 */ 4426 if (active) 4427 VOP_CLOSE(vp, FNONBLOCK, NOCRED, td); 4428 if (!doinginact) { 4429 do { 4430 if (oweinact || active) { 4431 VI_LOCK(vp); 4432 vinactivef(vp); 4433 oweinact = (vp->v_iflag & VI_OWEINACT) != 0; 4434 VI_UNLOCK(vp); 4435 } 4436 } while (oweinact); 4437 } 4438 if (vp->v_type == VSOCK) 4439 vfs_unp_reclaim(vp); 4440 4441 /* 4442 * Clean out any buffers associated with the vnode. 4443 * If the flush fails, just toss the buffers. 
4444 */ 4445 mp = NULL; 4446 if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd)) 4447 (void) vn_start_secondary_write(vp, &mp, V_WAIT); 4448 if (vinvalbuf(vp, V_SAVE, 0, 0) != 0) { 4449 while (vinvalbuf(vp, 0, 0, 0) != 0) 4450 ; 4451 } 4452 4453 BO_LOCK(&vp->v_bufobj); 4454 KASSERT(TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd) && 4455 vp->v_bufobj.bo_dirty.bv_cnt == 0 && 4456 TAILQ_EMPTY(&vp->v_bufobj.bo_clean.bv_hd) && 4457 vp->v_bufobj.bo_clean.bv_cnt == 0, 4458 ("vp %p bufobj not invalidated", vp)); 4459 4460 /* 4461 * For VMIO bufobj, BO_DEAD is set later, or in 4462 * vm_object_terminate() after the object's page queue is 4463 * flushed. 4464 */ 4465 object = vp->v_bufobj.bo_object; 4466 if (object == NULL) 4467 vp->v_bufobj.bo_flag |= BO_DEAD; 4468 BO_UNLOCK(&vp->v_bufobj); 4469 4470 /* 4471 * Handle the VM part. Tmpfs handles v_object on its own (the 4472 * OBJT_VNODE check). Nullfs or other bypassing filesystems 4473 * should not touch the object borrowed from the lower vnode 4474 * (the handle check). 4475 */ 4476 if (object != NULL && object->type == OBJT_VNODE && 4477 object->handle == vp) 4478 vnode_destroy_vobject(vp); 4479 4480 /* 4481 * Reclaim the vnode. 4482 */ 4483 if (VOP_RECLAIM(vp)) 4484 panic("vgone: cannot reclaim"); 4485 if (mp != NULL) 4486 vn_finished_secondary_write(mp); 4487 VNASSERT(vp->v_object == NULL, vp, 4488 ("vop_reclaim left v_object vp=%p", vp)); 4489 /* 4490 * Clear the advisory locks and wake up waiting threads. 4491 */ 4492 if (vp->v_lockf != NULL) { 4493 (void)VOP_ADVLOCKPURGE(vp); 4494 vp->v_lockf = NULL; 4495 } 4496 /* 4497 * Delete from old mount point vnode list. 4498 */ 4499 if (vp->v_mount == NULL) { 4500 VI_LOCK(vp); 4501 } else { 4502 delmntque(vp); 4503 ASSERT_VI_LOCKED(vp, "vgonel 2"); 4504 } 4505 /* 4506 * Done with purge, reset to the standard lock and invalidate 4507 * the vnode. 4508 */ 4509 vp->v_vnlock = &vp->v_lock; 4510 vp->v_op = &dead_vnodeops; 4511 vp->v_type = VBAD; 4512 vn_set_state(vp, VSTATE_DEAD); 4513 } 4514 4515 /* 4516 * Print out a description of a vnode. 4517 */ 4518 static const char *const vtypename[] = { 4519 [VNON] = "VNON", 4520 [VREG] = "VREG", 4521 [VDIR] = "VDIR", 4522 [VBLK] = "VBLK", 4523 [VCHR] = "VCHR", 4524 [VLNK] = "VLNK", 4525 [VSOCK] = "VSOCK", 4526 [VFIFO] = "VFIFO", 4527 [VBAD] = "VBAD", 4528 [VMARKER] = "VMARKER", 4529 }; 4530 _Static_assert(nitems(vtypename) == VLASTTYPE + 1, 4531 "vnode type name not added to vtypename"); 4532 4533 static const char *const vstatename[] = { 4534 [VSTATE_UNINITIALIZED] = "VSTATE_UNINITIALIZED", 4535 [VSTATE_CONSTRUCTED] = "VSTATE_CONSTRUCTED", 4536 [VSTATE_DESTROYING] = "VSTATE_DESTROYING", 4537 [VSTATE_DEAD] = "VSTATE_DEAD", 4538 }; 4539 _Static_assert(nitems(vstatename) == VLASTSTATE + 1, 4540 "vnode state name not added to vstatename"); 4541 4542 _Static_assert((VHOLD_ALL_FLAGS & ~VHOLD_NO_SMR) == 0, 4543 "new hold count flag not added to vn_printf"); 4544 4545 void 4546 vn_printf(struct vnode *vp, const char *fmt, ...) 
4547 { 4548 va_list ap; 4549 char buf[256], buf2[16]; 4550 u_long flags; 4551 u_int holdcnt; 4552 short irflag; 4553 4554 va_start(ap, fmt); 4555 vprintf(fmt, ap); 4556 va_end(ap); 4557 printf("%p: ", (void *)vp); 4558 printf("type %s state %s op %p\n", vtypename[vp->v_type], 4559 vstatename[vp->v_state], vp->v_op); 4560 holdcnt = atomic_load_int(&vp->v_holdcnt); 4561 printf(" usecount %d, writecount %d, refcount %d seqc users %d", 4562 vp->v_usecount, vp->v_writecount, holdcnt & ~VHOLD_ALL_FLAGS, 4563 vp->v_seqc_users); 4564 switch (vp->v_type) { 4565 case VDIR: 4566 printf(" mountedhere %p\n", vp->v_mountedhere); 4567 break; 4568 case VCHR: 4569 printf(" rdev %p\n", vp->v_rdev); 4570 break; 4571 case VSOCK: 4572 printf(" socket %p\n", vp->v_unpcb); 4573 break; 4574 case VFIFO: 4575 printf(" fifoinfo %p\n", vp->v_fifoinfo); 4576 break; 4577 default: 4578 printf("\n"); 4579 break; 4580 } 4581 buf[0] = '\0'; 4582 buf[1] = '\0'; 4583 if (holdcnt & VHOLD_NO_SMR) 4584 strlcat(buf, "|VHOLD_NO_SMR", sizeof(buf)); 4585 printf(" hold count flags (%s)\n", buf + 1); 4586 4587 buf[0] = '\0'; 4588 buf[1] = '\0'; 4589 irflag = vn_irflag_read(vp); 4590 if (irflag & VIRF_DOOMED) 4591 strlcat(buf, "|VIRF_DOOMED", sizeof(buf)); 4592 if (irflag & VIRF_PGREAD) 4593 strlcat(buf, "|VIRF_PGREAD", sizeof(buf)); 4594 if (irflag & VIRF_MOUNTPOINT) 4595 strlcat(buf, "|VIRF_MOUNTPOINT", sizeof(buf)); 4596 if (irflag & VIRF_TEXT_REF) 4597 strlcat(buf, "|VIRF_TEXT_REF", sizeof(buf)); 4598 flags = irflag & ~(VIRF_DOOMED | VIRF_PGREAD | VIRF_MOUNTPOINT | VIRF_TEXT_REF); 4599 if (flags != 0) { 4600 snprintf(buf2, sizeof(buf2), "|VIRF(0x%lx)", flags); 4601 strlcat(buf, buf2, sizeof(buf)); 4602 } 4603 if (vp->v_vflag & VV_ROOT) 4604 strlcat(buf, "|VV_ROOT", sizeof(buf)); 4605 if (vp->v_vflag & VV_ISTTY) 4606 strlcat(buf, "|VV_ISTTY", sizeof(buf)); 4607 if (vp->v_vflag & VV_NOSYNC) 4608 strlcat(buf, "|VV_NOSYNC", sizeof(buf)); 4609 if (vp->v_vflag & VV_ETERNALDEV) 4610 strlcat(buf, "|VV_ETERNALDEV", sizeof(buf)); 4611 if (vp->v_vflag & VV_CACHEDLABEL) 4612 strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf)); 4613 if (vp->v_vflag & VV_VMSIZEVNLOCK) 4614 strlcat(buf, "|VV_VMSIZEVNLOCK", sizeof(buf)); 4615 if (vp->v_vflag & VV_COPYONWRITE) 4616 strlcat(buf, "|VV_COPYONWRITE", sizeof(buf)); 4617 if (vp->v_vflag & VV_SYSTEM) 4618 strlcat(buf, "|VV_SYSTEM", sizeof(buf)); 4619 if (vp->v_vflag & VV_PROCDEP) 4620 strlcat(buf, "|VV_PROCDEP", sizeof(buf)); 4621 if (vp->v_vflag & VV_DELETED) 4622 strlcat(buf, "|VV_DELETED", sizeof(buf)); 4623 if (vp->v_vflag & VV_MD) 4624 strlcat(buf, "|VV_MD", sizeof(buf)); 4625 if (vp->v_vflag & VV_FORCEINSMQ) 4626 strlcat(buf, "|VV_FORCEINSMQ", sizeof(buf)); 4627 if (vp->v_vflag & VV_READLINK) 4628 strlcat(buf, "|VV_READLINK", sizeof(buf)); 4629 flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_ETERNALDEV | 4630 VV_CACHEDLABEL | VV_VMSIZEVNLOCK | VV_COPYONWRITE | VV_SYSTEM | 4631 VV_PROCDEP | VV_DELETED | VV_MD | VV_FORCEINSMQ | VV_READLINK); 4632 if (flags != 0) { 4633 snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags); 4634 strlcat(buf, buf2, sizeof(buf)); 4635 } 4636 if (vp->v_iflag & VI_MOUNT) 4637 strlcat(buf, "|VI_MOUNT", sizeof(buf)); 4638 if (vp->v_iflag & VI_DOINGINACT) 4639 strlcat(buf, "|VI_DOINGINACT", sizeof(buf)); 4640 if (vp->v_iflag & VI_OWEINACT) 4641 strlcat(buf, "|VI_OWEINACT", sizeof(buf)); 4642 if (vp->v_iflag & VI_DEFINACT) 4643 strlcat(buf, "|VI_DEFINACT", sizeof(buf)); 4644 if (vp->v_iflag & VI_FOPENING) 4645 strlcat(buf, "|VI_FOPENING", sizeof(buf)); 4646 flags = 
vp->v_iflag & ~(VI_MOUNT | VI_DOINGINACT | 4647 VI_OWEINACT | VI_DEFINACT | VI_FOPENING); 4648 if (flags != 0) { 4649 snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags); 4650 strlcat(buf, buf2, sizeof(buf)); 4651 } 4652 if (vp->v_mflag & VMP_LAZYLIST) 4653 strlcat(buf, "|VMP_LAZYLIST", sizeof(buf)); 4654 flags = vp->v_mflag & ~(VMP_LAZYLIST); 4655 if (flags != 0) { 4656 snprintf(buf2, sizeof(buf2), "|VMP(0x%lx)", flags); 4657 strlcat(buf, buf2, sizeof(buf)); 4658 } 4659 printf(" flags (%s)", buf + 1); 4660 if (mtx_owned(VI_MTX(vp))) 4661 printf(" VI_LOCKed"); 4662 printf("\n"); 4663 if (vp->v_object != NULL) 4664 printf(" v_object %p ref %d pages %d " 4665 "cleanbuf %d dirtybuf %d\n", 4666 vp->v_object, vp->v_object->ref_count, 4667 vp->v_object->resident_page_count, 4668 vp->v_bufobj.bo_clean.bv_cnt, 4669 vp->v_bufobj.bo_dirty.bv_cnt); 4670 printf(" "); 4671 lockmgr_printinfo(vp->v_vnlock); 4672 if (vp->v_data != NULL) 4673 VOP_PRINT(vp); 4674 } 4675 4676 #ifdef DDB 4677 /* 4678 * List all of the locked vnodes in the system. 4679 * Called when debugging the kernel. 4680 */ 4681 DB_SHOW_COMMAND_FLAGS(lockedvnods, lockedvnodes, DB_CMD_MEMSAFE) 4682 { 4683 struct mount *mp; 4684 struct vnode *vp; 4685 4686 /* 4687 * Note: because this is DDB, we can't obey the locking semantics 4688 * for these structures, which means we could catch an inconsistent 4689 * state and dereference a nasty pointer. Not much to be done 4690 * about that. 4691 */ 4692 db_printf("Locked vnodes\n"); 4693 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 4694 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 4695 if (vp->v_type != VMARKER && VOP_ISLOCKED(vp)) 4696 vn_printf(vp, "vnode "); 4697 } 4698 } 4699 } 4700 4701 /* 4702 * Show details about the given vnode. 4703 */ 4704 DB_SHOW_COMMAND(vnode, db_show_vnode) 4705 { 4706 struct vnode *vp; 4707 4708 if (!have_addr) 4709 return; 4710 vp = (struct vnode *)addr; 4711 vn_printf(vp, "vnode "); 4712 } 4713 4714 /* 4715 * Show details about the given mount point. 4716 */ 4717 DB_SHOW_COMMAND(mount, db_show_mount) 4718 { 4719 struct mount *mp; 4720 struct vfsopt *opt; 4721 struct statfs *sp; 4722 struct vnode *vp; 4723 char buf[512]; 4724 uint64_t mflags; 4725 u_int flags; 4726 4727 if (!have_addr) { 4728 /* No address given, print short info about all mount points. 
*/ 4729 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 4730 db_printf("%p %s on %s (%s)\n", mp, 4731 mp->mnt_stat.f_mntfromname, 4732 mp->mnt_stat.f_mntonname, 4733 mp->mnt_stat.f_fstypename); 4734 if (db_pager_quit) 4735 break; 4736 } 4737 db_printf("\nMore info: show mount <addr>\n"); 4738 return; 4739 } 4740 4741 mp = (struct mount *)addr; 4742 db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname, 4743 mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename); 4744 4745 buf[0] = '\0'; 4746 mflags = mp->mnt_flag; 4747 #define MNT_FLAG(flag) do { \ 4748 if (mflags & (flag)) { \ 4749 if (buf[0] != '\0') \ 4750 strlcat(buf, ", ", sizeof(buf)); \ 4751 strlcat(buf, (#flag) + 4, sizeof(buf)); \ 4752 mflags &= ~(flag); \ 4753 } \ 4754 } while (0) 4755 MNT_FLAG(MNT_RDONLY); 4756 MNT_FLAG(MNT_SYNCHRONOUS); 4757 MNT_FLAG(MNT_NOEXEC); 4758 MNT_FLAG(MNT_NOSUID); 4759 MNT_FLAG(MNT_NFS4ACLS); 4760 MNT_FLAG(MNT_UNION); 4761 MNT_FLAG(MNT_ASYNC); 4762 MNT_FLAG(MNT_SUIDDIR); 4763 MNT_FLAG(MNT_SOFTDEP); 4764 MNT_FLAG(MNT_NOSYMFOLLOW); 4765 MNT_FLAG(MNT_GJOURNAL); 4766 MNT_FLAG(MNT_MULTILABEL); 4767 MNT_FLAG(MNT_ACLS); 4768 MNT_FLAG(MNT_NOATIME); 4769 MNT_FLAG(MNT_NOCLUSTERR); 4770 MNT_FLAG(MNT_NOCLUSTERW); 4771 MNT_FLAG(MNT_SUJ); 4772 MNT_FLAG(MNT_EXRDONLY); 4773 MNT_FLAG(MNT_EXPORTED); 4774 MNT_FLAG(MNT_DEFEXPORTED); 4775 MNT_FLAG(MNT_EXPORTANON); 4776 MNT_FLAG(MNT_EXKERB); 4777 MNT_FLAG(MNT_EXPUBLIC); 4778 MNT_FLAG(MNT_LOCAL); 4779 MNT_FLAG(MNT_QUOTA); 4780 MNT_FLAG(MNT_ROOTFS); 4781 MNT_FLAG(MNT_USER); 4782 MNT_FLAG(MNT_IGNORE); 4783 MNT_FLAG(MNT_UPDATE); 4784 MNT_FLAG(MNT_DELEXPORT); 4785 MNT_FLAG(MNT_RELOAD); 4786 MNT_FLAG(MNT_FORCE); 4787 MNT_FLAG(MNT_SNAPSHOT); 4788 MNT_FLAG(MNT_BYFSID); 4789 MNT_FLAG(MNT_NAMEDATTR); 4790 #undef MNT_FLAG 4791 if (mflags != 0) { 4792 if (buf[0] != '\0') 4793 strlcat(buf, ", ", sizeof(buf)); 4794 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), 4795 "0x%016jx", mflags); 4796 } 4797 db_printf(" mnt_flag = %s\n", buf); 4798 4799 buf[0] = '\0'; 4800 flags = mp->mnt_kern_flag; 4801 #define MNT_KERN_FLAG(flag) do { \ 4802 if (flags & (flag)) { \ 4803 if (buf[0] != '\0') \ 4804 strlcat(buf, ", ", sizeof(buf)); \ 4805 strlcat(buf, (#flag) + 5, sizeof(buf)); \ 4806 flags &= ~(flag); \ 4807 } \ 4808 } while (0) 4809 MNT_KERN_FLAG(MNTK_UNMOUNTF); 4810 MNT_KERN_FLAG(MNTK_ASYNC); 4811 MNT_KERN_FLAG(MNTK_SOFTDEP); 4812 MNT_KERN_FLAG(MNTK_NOMSYNC); 4813 MNT_KERN_FLAG(MNTK_DRAINING); 4814 MNT_KERN_FLAG(MNTK_REFEXPIRE); 4815 MNT_KERN_FLAG(MNTK_EXTENDED_SHARED); 4816 MNT_KERN_FLAG(MNTK_SHARED_WRITES); 4817 MNT_KERN_FLAG(MNTK_NO_IOPF); 4818 MNT_KERN_FLAG(MNTK_RECURSE); 4819 MNT_KERN_FLAG(MNTK_UPPER_WAITER); 4820 MNT_KERN_FLAG(MNTK_UNLOCKED_INSMNTQUE); 4821 MNT_KERN_FLAG(MNTK_USES_BCACHE); 4822 MNT_KERN_FLAG(MNTK_VMSETSIZE_BUG); 4823 MNT_KERN_FLAG(MNTK_FPLOOKUP); 4824 MNT_KERN_FLAG(MNTK_TASKQUEUE_WAITER); 4825 MNT_KERN_FLAG(MNTK_NOASYNC); 4826 MNT_KERN_FLAG(MNTK_UNMOUNT); 4827 MNT_KERN_FLAG(MNTK_MWAIT); 4828 MNT_KERN_FLAG(MNTK_SUSPEND); 4829 MNT_KERN_FLAG(MNTK_SUSPEND2); 4830 MNT_KERN_FLAG(MNTK_SUSPENDED); 4831 MNT_KERN_FLAG(MNTK_NULL_NOCACHE); 4832 MNT_KERN_FLAG(MNTK_LOOKUP_SHARED); 4833 #undef MNT_KERN_FLAG 4834 if (flags != 0) { 4835 if (buf[0] != '\0') 4836 strlcat(buf, ", ", sizeof(buf)); 4837 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), 4838 "0x%08x", flags); 4839 } 4840 db_printf(" mnt_kern_flag = %s\n", buf); 4841 4842 db_printf(" mnt_opt = "); 4843 opt = TAILQ_FIRST(mp->mnt_opt); 4844 if (opt != NULL) { 4845 db_printf("%s", opt->name); 4846 opt = TAILQ_NEXT(opt, 
link); 4847 while (opt != NULL) { 4848 db_printf(", %s", opt->name); 4849 opt = TAILQ_NEXT(opt, link); 4850 } 4851 } 4852 db_printf("\n"); 4853 4854 sp = &mp->mnt_stat; 4855 db_printf(" mnt_stat = { version=%u type=%u flags=0x%016jx " 4856 "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju " 4857 "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju " 4858 "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n", 4859 (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags, 4860 (uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize, 4861 (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree, 4862 (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files, 4863 (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites, 4864 (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads, 4865 (uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax, 4866 (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]); 4867 4868 db_printf(" mnt_cred = { uid=%u ruid=%u", 4869 (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid); 4870 if (jailed(mp->mnt_cred)) 4871 db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id); 4872 db_printf(" }\n"); 4873 db_printf(" mnt_ref = %d (with %d in the struct)\n", 4874 vfs_mount_fetch_counter(mp, MNT_COUNT_REF), mp->mnt_ref); 4875 db_printf(" mnt_gen = %d\n", mp->mnt_gen); 4876 db_printf(" mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize); 4877 db_printf(" mnt_lazyvnodelistsize = %d\n", 4878 mp->mnt_lazyvnodelistsize); 4879 db_printf(" mnt_writeopcount = %d (with %d in the struct)\n", 4880 vfs_mount_fetch_counter(mp, MNT_COUNT_WRITEOPCOUNT), mp->mnt_writeopcount); 4881 db_printf(" mnt_iosize_max = %d\n", mp->mnt_iosize_max); 4882 db_printf(" mnt_hashseed = %u\n", mp->mnt_hashseed); 4883 db_printf(" mnt_lockref = %d (with %d in the struct)\n", 4884 vfs_mount_fetch_counter(mp, MNT_COUNT_LOCKREF), mp->mnt_lockref); 4885 db_printf(" mnt_secondary_writes = %d\n", mp->mnt_secondary_writes); 4886 db_printf(" mnt_secondary_accwrites = %d\n", 4887 mp->mnt_secondary_accwrites); 4888 db_printf(" mnt_gjprovider = %s\n", 4889 mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL"); 4890 db_printf(" mnt_vfs_ops = %d\n", mp->mnt_vfs_ops); 4891 4892 db_printf("\n\nList of active vnodes\n"); 4893 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 4894 if (vp->v_type != VMARKER && vp->v_holdcnt > 0) { 4895 vn_printf(vp, "vnode "); 4896 if (db_pager_quit) 4897 break; 4898 } 4899 } 4900 db_printf("\n\nList of inactive vnodes\n"); 4901 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 4902 if (vp->v_type != VMARKER && vp->v_holdcnt == 0) { 4903 vn_printf(vp, "vnode "); 4904 if (db_pager_quit) 4905 break; 4906 } 4907 } 4908 } 4909 #endif /* DDB */ 4910 4911 /* 4912 * Fill in a struct xvfsconf based on a struct vfsconf. 4913 */ 4914 static int 4915 vfsconf2x(struct sysctl_req *req, struct vfsconf *vfsp) 4916 { 4917 struct xvfsconf xvfsp; 4918 4919 bzero(&xvfsp, sizeof(xvfsp)); 4920 strcpy(xvfsp.vfc_name, vfsp->vfc_name); 4921 xvfsp.vfc_typenum = vfsp->vfc_typenum; 4922 xvfsp.vfc_refcount = vfsp->vfc_refcount; 4923 xvfsp.vfc_flags = vfsp->vfc_flags; 4924 /* 4925 * These are unused in userland, we keep them 4926 * to not break binary compatibility. 
4927 */ 4928 xvfsp.vfc_vfsops = NULL; 4929 xvfsp.vfc_next = NULL; 4930 return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); 4931 } 4932 4933 #ifdef COMPAT_FREEBSD32 4934 struct xvfsconf32 { 4935 uint32_t vfc_vfsops; 4936 char vfc_name[MFSNAMELEN]; 4937 int32_t vfc_typenum; 4938 int32_t vfc_refcount; 4939 int32_t vfc_flags; 4940 uint32_t vfc_next; 4941 }; 4942 4943 static int 4944 vfsconf2x32(struct sysctl_req *req, struct vfsconf *vfsp) 4945 { 4946 struct xvfsconf32 xvfsp; 4947 4948 bzero(&xvfsp, sizeof(xvfsp)); 4949 strcpy(xvfsp.vfc_name, vfsp->vfc_name); 4950 xvfsp.vfc_typenum = vfsp->vfc_typenum; 4951 xvfsp.vfc_refcount = vfsp->vfc_refcount; 4952 xvfsp.vfc_flags = vfsp->vfc_flags; 4953 return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); 4954 } 4955 #endif 4956 4957 /* 4958 * Top level filesystem related information gathering. 4959 */ 4960 static int 4961 sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS) 4962 { 4963 struct vfsconf *vfsp; 4964 int error; 4965 4966 error = 0; 4967 vfsconf_slock(); 4968 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { 4969 #ifdef COMPAT_FREEBSD32 4970 if (req->flags & SCTL_MASK32) 4971 error = vfsconf2x32(req, vfsp); 4972 else 4973 #endif 4974 error = vfsconf2x(req, vfsp); 4975 if (error) 4976 break; 4977 } 4978 vfsconf_sunlock(); 4979 return (error); 4980 } 4981 4982 SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD | 4983 CTLFLAG_MPSAFE, NULL, 0, sysctl_vfs_conflist, 4984 "S,xvfsconf", "List of all configured filesystems"); 4985 4986 #ifndef BURN_BRIDGES 4987 static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS); 4988 4989 static int 4990 vfs_sysctl(SYSCTL_HANDLER_ARGS) 4991 { 4992 int *name = (int *)arg1 - 1; /* XXX */ 4993 u_int namelen = arg2 + 1; /* XXX */ 4994 struct vfsconf *vfsp; 4995 4996 log(LOG_WARNING, "userland calling deprecated sysctl, " 4997 "please rebuild world\n"); 4998 4999 #if 1 || defined(COMPAT_PRELITE2) 5000 /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. 
*/ 5001 if (namelen == 1) 5002 return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); 5003 #endif 5004 5005 switch (name[1]) { 5006 case VFS_MAXTYPENUM: 5007 if (namelen != 2) 5008 return (ENOTDIR); 5009 return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); 5010 case VFS_CONF: 5011 if (namelen != 3) 5012 return (ENOTDIR); /* overloaded */ 5013 vfsconf_slock(); 5014 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { 5015 if (vfsp->vfc_typenum == name[2]) 5016 break; 5017 } 5018 vfsconf_sunlock(); 5019 if (vfsp == NULL) 5020 return (EOPNOTSUPP); 5021 #ifdef COMPAT_FREEBSD32 5022 if (req->flags & SCTL_MASK32) 5023 return (vfsconf2x32(req, vfsp)); 5024 else 5025 #endif 5026 return (vfsconf2x(req, vfsp)); 5027 } 5028 return (EOPNOTSUPP); 5029 } 5030 5031 static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP | 5032 CTLFLAG_MPSAFE, vfs_sysctl, 5033 "Generic filesystem"); 5034 5035 #if 1 || defined(COMPAT_PRELITE2) 5036 5037 static int 5038 sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS) 5039 { 5040 int error; 5041 struct vfsconf *vfsp; 5042 struct ovfsconf ovfs; 5043 5044 vfsconf_slock(); 5045 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { 5046 bzero(&ovfs, sizeof(ovfs)); 5047 ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ 5048 strcpy(ovfs.vfc_name, vfsp->vfc_name); 5049 ovfs.vfc_index = vfsp->vfc_typenum; 5050 ovfs.vfc_refcount = vfsp->vfc_refcount; 5051 ovfs.vfc_flags = vfsp->vfc_flags; 5052 error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); 5053 if (error != 0) { 5054 vfsconf_sunlock(); 5055 return (error); 5056 } 5057 } 5058 vfsconf_sunlock(); 5059 return (0); 5060 } 5061 5062 #endif /* 1 || COMPAT_PRELITE2 */ 5063 #endif /* !BURN_BRIDGES */ 5064 5065 static void 5066 unmount_or_warn(struct mount *mp) 5067 { 5068 int error; 5069 5070 error = dounmount(mp, MNT_FORCE, curthread); 5071 if (error != 0) { 5072 printf("unmount of %s failed (", mp->mnt_stat.f_mntonname); 5073 if (error == EBUSY) 5074 printf("BUSY)\n"); 5075 else 5076 printf("%d)\n", error); 5077 } 5078 } 5079 5080 /* 5081 * Unmount all filesystems. The list is traversed in reverse order 5082 * of mounting to avoid dependencies. 5083 */ 5084 void 5085 vfs_unmountall(void) 5086 { 5087 struct mount *mp, *tmp; 5088 5089 CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__); 5090 5091 /* 5092 * Since this only runs when rebooting, it is not interlocked. 5093 */ 5094 TAILQ_FOREACH_REVERSE_SAFE(mp, &mountlist, mntlist, mnt_list, tmp) { 5095 vfs_ref(mp); 5096 5097 /* 5098 * Forcibly unmounting "/dev" before "/" would prevent clean 5099 * unmount of the latter. 
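 * (rootdevmp is therefore skipped in this pass and, if present, unmounted
 * last after the loop below.)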
5100 */ 5101 if (mp == rootdevmp) 5102 continue; 5103 5104 unmount_or_warn(mp); 5105 } 5106 5107 if (rootdevmp != NULL) 5108 unmount_or_warn(rootdevmp); 5109 } 5110 5111 static void 5112 vfs_deferred_inactive(struct vnode *vp, int lkflags) 5113 { 5114 5115 ASSERT_VI_LOCKED(vp, __func__); 5116 VNPASS((vp->v_iflag & VI_DEFINACT) == 0, vp); 5117 if ((vp->v_iflag & VI_OWEINACT) == 0) { 5118 vdropl(vp); 5119 return; 5120 } 5121 if (vn_lock(vp, lkflags) == 0) { 5122 VI_LOCK(vp); 5123 vinactive(vp); 5124 VOP_UNLOCK(vp); 5125 vdropl(vp); 5126 return; 5127 } 5128 vdefer_inactive_unlocked(vp); 5129 } 5130 5131 static int 5132 vfs_periodic_inactive_filter(struct vnode *vp, void *arg) 5133 { 5134 5135 return (vp->v_iflag & VI_DEFINACT); 5136 } 5137 5138 static void __noinline 5139 vfs_periodic_inactive(struct mount *mp, int flags) 5140 { 5141 struct vnode *vp, *mvp; 5142 int lkflags; 5143 5144 lkflags = LK_EXCLUSIVE | LK_INTERLOCK; 5145 if (flags != MNT_WAIT) 5146 lkflags |= LK_NOWAIT; 5147 5148 MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, vfs_periodic_inactive_filter, NULL) { 5149 if ((vp->v_iflag & VI_DEFINACT) == 0) { 5150 VI_UNLOCK(vp); 5151 continue; 5152 } 5153 vp->v_iflag &= ~VI_DEFINACT; 5154 vfs_deferred_inactive(vp, lkflags); 5155 } 5156 } 5157 5158 static inline bool 5159 vfs_want_msync(struct vnode *vp) 5160 { 5161 struct vm_object *obj; 5162 5163 /* 5164 * This test may be performed without any locks held. 5165 * We rely on vm_object's type stability. 5166 */ 5167 if (vp->v_vflag & VV_NOSYNC) 5168 return (false); 5169 obj = vp->v_object; 5170 return (obj != NULL && vm_object_mightbedirty(obj)); 5171 } 5172 5173 static int 5174 vfs_periodic_msync_inactive_filter(struct vnode *vp, void *arg __unused) 5175 { 5176 5177 if (vp->v_vflag & VV_NOSYNC) 5178 return (false); 5179 if (vp->v_iflag & VI_DEFINACT) 5180 return (true); 5181 return (vfs_want_msync(vp)); 5182 } 5183 5184 static void __noinline 5185 vfs_periodic_msync_inactive(struct mount *mp, int flags) 5186 { 5187 struct vnode *vp, *mvp; 5188 int lkflags; 5189 bool seen_defer; 5190 5191 lkflags = LK_EXCLUSIVE | LK_INTERLOCK; 5192 if (flags != MNT_WAIT) 5193 lkflags |= LK_NOWAIT; 5194 5195 MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, vfs_periodic_msync_inactive_filter, NULL) { 5196 seen_defer = false; 5197 if (vp->v_iflag & VI_DEFINACT) { 5198 vp->v_iflag &= ~VI_DEFINACT; 5199 seen_defer = true; 5200 } 5201 if (!vfs_want_msync(vp)) { 5202 if (seen_defer) 5203 vfs_deferred_inactive(vp, lkflags); 5204 else 5205 VI_UNLOCK(vp); 5206 continue; 5207 } 5208 if (vget(vp, lkflags) == 0) { 5209 if ((vp->v_vflag & VV_NOSYNC) == 0) { 5210 if (flags == MNT_WAIT) 5211 vnode_pager_clean_sync(vp); 5212 else 5213 vnode_pager_clean_async(vp); 5214 } 5215 vput(vp); 5216 if (seen_defer) 5217 vdrop(vp); 5218 } else { 5219 if (seen_defer) 5220 vdefer_inactive_unlocked(vp); 5221 } 5222 } 5223 } 5224 5225 void 5226 vfs_periodic(struct mount *mp, int flags) 5227 { 5228 5229 CTR2(KTR_VFS, "%s: mp %p", __func__, mp); 5230 5231 if ((mp->mnt_kern_flag & MNTK_NOMSYNC) != 0) 5232 vfs_periodic_inactive(mp, flags); 5233 else 5234 vfs_periodic_msync_inactive(mp, flags); 5235 } 5236 5237 static void 5238 destroy_vpollinfo_free(struct vpollinfo *vi) 5239 { 5240 5241 knlist_destroy(&vi->vpi_selinfo.si_note); 5242 mtx_destroy(&vi->vpi_lock); 5243 free(vi, M_VNODEPOLL); 5244 } 5245 5246 static void 5247 destroy_vpollinfo(struct vpollinfo *vi) 5248 { 5249 5250 knlist_clear(&vi->vpi_selinfo.si_note, 1); 5251 seldrain(&vi->vpi_selinfo); 5252 destroy_vpollinfo_free(vi); 5253 } 5254 5255 /* 5256 * 
Initialize per-vnode helper structure to hold poll-related state. 5257 */ 5258 void 5259 v_addpollinfo(struct vnode *vp) 5260 { 5261 struct vpollinfo *vi; 5262 5263 if (vp->v_pollinfo != NULL) 5264 return; 5265 vi = malloc(sizeof(*vi), M_VNODEPOLL, M_WAITOK | M_ZERO); 5266 mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF); 5267 knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock, 5268 vfs_knlunlock, vfs_knl_assert_lock); 5269 VI_LOCK(vp); 5270 if (vp->v_pollinfo != NULL) { 5271 VI_UNLOCK(vp); 5272 destroy_vpollinfo_free(vi); 5273 return; 5274 } 5275 vp->v_pollinfo = vi; 5276 VI_UNLOCK(vp); 5277 } 5278 5279 /* 5280 * Record a process's interest in events which might happen to 5281 * a vnode. Because poll uses the historic select-style interface 5282 * internally, this routine serves as both the ``check for any 5283 * pending events'' and the ``record my interest in future events'' 5284 * functions. (These are done together, while the lock is held, 5285 * to avoid race conditions.) 5286 */ 5287 int 5288 vn_pollrecord(struct vnode *vp, struct thread *td, int events) 5289 { 5290 5291 v_addpollinfo(vp); 5292 mtx_lock(&vp->v_pollinfo->vpi_lock); 5293 if (vp->v_pollinfo->vpi_revents & events) { 5294 /* 5295 * This leaves events we are not interested 5296 * in available for the other process 5297 * which presumably had requested them 5298 * (otherwise they would never have been 5299 * recorded). 5300 */ 5301 events &= vp->v_pollinfo->vpi_revents; 5302 vp->v_pollinfo->vpi_revents &= ~events; 5303 5304 mtx_unlock(&vp->v_pollinfo->vpi_lock); 5305 return (events); 5306 } 5307 vp->v_pollinfo->vpi_events |= events; 5308 selrecord(td, &vp->v_pollinfo->vpi_selinfo); 5309 mtx_unlock(&vp->v_pollinfo->vpi_lock); 5310 return (0); 5311 } 5312 5313 /* 5314 * Routine to create and manage a filesystem syncer vnode. 5315 */ 5316 #define sync_close ((int (*)(struct vop_close_args *))nullop) 5317 static int sync_fsync(struct vop_fsync_args *); 5318 static int sync_inactive(struct vop_inactive_args *); 5319 static int sync_reclaim(struct vop_reclaim_args *); 5320 5321 static struct vop_vector sync_vnodeops = { 5322 .vop_bypass = VOP_EOPNOTSUPP, 5323 .vop_close = sync_close, 5324 .vop_fsync = sync_fsync, 5325 .vop_getwritemount = vop_stdgetwritemount, 5326 .vop_inactive = sync_inactive, 5327 .vop_need_inactive = vop_stdneed_inactive, 5328 .vop_reclaim = sync_reclaim, 5329 .vop_lock1 = vop_stdlock, 5330 .vop_unlock = vop_stdunlock, 5331 .vop_islocked = vop_stdislocked, 5332 .vop_fplookup_vexec = VOP_EAGAIN, 5333 .vop_fplookup_symlink = VOP_EAGAIN, 5334 }; 5335 VFS_VOP_VECTOR_REGISTER(sync_vnodeops); 5336 5337 /* 5338 * Create a new filesystem syncer vnode for the specified mount point. 5339 */ 5340 void 5341 vfs_allocate_syncvnode(struct mount *mp) 5342 { 5343 struct vnode *vp; 5344 struct bufobj *bo; 5345 static long start, incr, next; 5346 int error; 5347 5348 /* Allocate a new vnode */ 5349 error = getnewvnode("syncer", mp, &sync_vnodeops, &vp); 5350 if (error != 0) 5351 panic("vfs_allocate_syncvnode: getnewvnode() failed"); 5352 vp->v_type = VNON; 5353 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 5354 vp->v_vflag |= VV_FORCEINSMQ; 5355 error = insmntque1(vp, mp); 5356 if (error != 0) 5357 panic("vfs_allocate_syncvnode: insmntque() failed"); 5358 vp->v_vflag &= ~VV_FORCEINSMQ; 5359 vn_set_state(vp, VSTATE_CONSTRUCTED); 5360 VOP_UNLOCK(vp); 5361 /* 5362 * Place the vnode onto the syncer worklist.
We attempt to 5363 * scatter them about on the list so that they will go off 5364 * at evenly distributed times even if all the filesystems 5365 * are mounted at once. 5366 */ 5367 next += incr; 5368 if (next == 0 || next > syncer_maxdelay) { 5369 start /= 2; 5370 incr /= 2; 5371 if (start == 0) { 5372 start = syncer_maxdelay / 2; 5373 incr = syncer_maxdelay; 5374 } 5375 next = start; 5376 } 5377 bo = &vp->v_bufobj; 5378 BO_LOCK(bo); 5379 vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0); 5380 /* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */ 5381 mtx_lock(&sync_mtx); 5382 sync_vnode_count++; 5383 if (mp->mnt_syncer == NULL) { 5384 mp->mnt_syncer = vp; 5385 vp = NULL; 5386 } 5387 mtx_unlock(&sync_mtx); 5388 BO_UNLOCK(bo); 5389 if (vp != NULL) { 5390 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 5391 vgone(vp); 5392 vput(vp); 5393 } 5394 } 5395 5396 void 5397 vfs_deallocate_syncvnode(struct mount *mp) 5398 { 5399 struct vnode *vp; 5400 5401 mtx_lock(&sync_mtx); 5402 vp = mp->mnt_syncer; 5403 if (vp != NULL) 5404 mp->mnt_syncer = NULL; 5405 mtx_unlock(&sync_mtx); 5406 if (vp != NULL) 5407 vrele(vp); 5408 } 5409 5410 /* 5411 * Do a lazy sync of the filesystem. 5412 */ 5413 static int 5414 sync_fsync(struct vop_fsync_args *ap) 5415 { 5416 struct vnode *syncvp = ap->a_vp; 5417 struct mount *mp = syncvp->v_mount; 5418 int error, save; 5419 struct bufobj *bo; 5420 5421 /* 5422 * We only need to do something if this is a lazy evaluation. 5423 */ 5424 if (ap->a_waitfor != MNT_LAZY) 5425 return (0); 5426 5427 /* 5428 * Move ourselves to the back of the sync list. 5429 */ 5430 bo = &syncvp->v_bufobj; 5431 BO_LOCK(bo); 5432 vn_syncer_add_to_worklist(bo, syncdelay); 5433 BO_UNLOCK(bo); 5434 5435 /* 5436 * Walk the list of vnodes pushing all that are dirty and 5437 * not already on the sync list. 5438 */ 5439 if (vfs_busy(mp, MBF_NOWAIT) != 0) 5440 return (0); 5441 VOP_UNLOCK(syncvp); 5442 save = curthread_pflags_set(TDP_SYNCIO); 5443 /* 5444 * The filesystem at hand may be idle with free vnodes stored in the 5445 * batch. Return them instead of letting them stay there indefinitely. 5446 */ 5447 vfs_periodic(mp, MNT_NOWAIT); 5448 error = VFS_SYNC(mp, MNT_LAZY); 5449 curthread_pflags_restore(save); 5450 vn_lock(syncvp, LK_EXCLUSIVE | LK_RETRY); 5451 vfs_unbusy(mp); 5452 return (error); 5453 } 5454 5455 /* 5456 * The syncer vnode is no longer referenced. 5457 */ 5458 static int 5459 sync_inactive(struct vop_inactive_args *ap) 5460 { 5461 5462 vgone(ap->a_vp); 5463 return (0); 5464 } 5465 5466 /* 5467 * The syncer vnode is no longer needed and is being decommissioned. 5468 * 5469 * Modifications to the worklist must be protected by sync_mtx.
5470 */ 5471 static int 5472 sync_reclaim(struct vop_reclaim_args *ap) 5473 { 5474 struct vnode *vp = ap->a_vp; 5475 struct bufobj *bo; 5476 5477 bo = &vp->v_bufobj; 5478 BO_LOCK(bo); 5479 mtx_lock(&sync_mtx); 5480 if (vp->v_mount->mnt_syncer == vp) 5481 vp->v_mount->mnt_syncer = NULL; 5482 if (bo->bo_flag & BO_ONWORKLST) { 5483 LIST_REMOVE(bo, bo_synclist); 5484 syncer_worklist_len--; 5485 sync_vnode_count--; 5486 bo->bo_flag &= ~BO_ONWORKLST; 5487 } 5488 mtx_unlock(&sync_mtx); 5489 BO_UNLOCK(bo); 5490 5491 return (0); 5492 } 5493 5494 int 5495 vn_need_pageq_flush(struct vnode *vp) 5496 { 5497 struct vm_object *obj; 5498 5499 obj = vp->v_object; 5500 return (obj != NULL && (vp->v_vflag & VV_NOSYNC) == 0 && 5501 vm_object_mightbedirty(obj)); 5502 } 5503 5504 /* 5505 * Check if vnode represents a disk device 5506 */ 5507 bool 5508 vn_isdisk_error(struct vnode *vp, int *errp) 5509 { 5510 int error; 5511 5512 if (vp->v_type != VCHR) { 5513 error = ENOTBLK; 5514 goto out; 5515 } 5516 error = 0; 5517 dev_lock(); 5518 if (vp->v_rdev == NULL) 5519 error = ENXIO; 5520 else if (vp->v_rdev->si_devsw == NULL) 5521 error = ENXIO; 5522 else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK)) 5523 error = ENOTBLK; 5524 dev_unlock(); 5525 out: 5526 *errp = error; 5527 return (error == 0); 5528 } 5529 5530 bool 5531 vn_isdisk(struct vnode *vp) 5532 { 5533 int error; 5534 5535 return (vn_isdisk_error(vp, &error)); 5536 } 5537 5538 /* 5539 * VOP_FPLOOKUP_VEXEC routines are subject to special circumstances, see 5540 * the comment above cache_fplookup for details. 5541 */ 5542 int 5543 vaccess_vexec_smr(mode_t file_mode, uid_t file_uid, gid_t file_gid, struct ucred *cred) 5544 { 5545 int error; 5546 5547 VFS_SMR_ASSERT_ENTERED(); 5548 5549 /* Check the owner. */ 5550 if (cred->cr_uid == file_uid) { 5551 if (file_mode & S_IXUSR) 5552 return (0); 5553 goto out_error; 5554 } 5555 5556 /* Otherwise, check the groups (first match) */ 5557 if (groupmember(file_gid, cred)) { 5558 if (file_mode & S_IXGRP) 5559 return (0); 5560 goto out_error; 5561 } 5562 5563 /* Otherwise, check everyone else. */ 5564 if (file_mode & S_IXOTH) 5565 return (0); 5566 out_error: 5567 /* 5568 * Permission check failed, but it is possible denial will get overridden 5569 * (e.g., when root is traversing through a 700 directory owned by someone 5570 * else). 5571 * 5572 * vaccess() calls priv_check_cred which in turn can descend into MAC 5573 * modules overriding this result. It's quite unclear what semantics 5574 * they are allowed to operate with, thus for safety we don't call them 5575 * from within the SMR section. This also means if any such modules 5576 * are present, we have to let the regular lookup decide. 5577 */ 5578 error = priv_check_cred_vfs_lookup_nomac(cred); 5579 switch (error) { 5580 case 0: 5581 return (0); 5582 case EAGAIN: 5583 /* 5584 * MAC modules present. 5585 */ 5586 return (EAGAIN); 5587 case EPERM: 5588 return (EACCES); 5589 default: 5590 return (error); 5591 } 5592 } 5593 5594 /* 5595 * Common filesystem object access control check routine. Accepts a 5596 * vnode's type, "mode", uid and gid, requested access mode, and credentials. 5597 * Returns 0 on success, or an errno on failure.
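 *
 * A filesystem's VOP_ACCESS() implementation is typically expected to end up
 * here with something along these lines (illustrative sketch only; node_mode,
 * node_uid and node_gid stand for whatever the filesystem stores):
 *
 *	return (vaccess(vp->v_type, node_mode, node_uid, node_gid,
 *	    accmode, cred));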
5598 */ 5599 int 5600 vaccess(__enum_uint8(vtype) type, mode_t file_mode, uid_t file_uid, gid_t file_gid, 5601 accmode_t accmode, struct ucred *cred) 5602 { 5603 accmode_t dac_granted; 5604 accmode_t priv_granted; 5605 5606 KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0, 5607 ("invalid bit in accmode")); 5608 KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE), 5609 ("VAPPEND without VWRITE")); 5610 5611 /* 5612 * Look for a normal, non-privileged way to access the file/directory 5613 * as requested. If it exists, go with that. 5614 */ 5615 5616 dac_granted = 0; 5617 5618 /* Check the owner. */ 5619 if (cred->cr_uid == file_uid) { 5620 dac_granted |= VADMIN; 5621 if (file_mode & S_IXUSR) 5622 dac_granted |= VEXEC; 5623 if (file_mode & S_IRUSR) 5624 dac_granted |= VREAD; 5625 if (file_mode & S_IWUSR) 5626 dac_granted |= (VWRITE | VAPPEND); 5627 5628 if ((accmode & dac_granted) == accmode) 5629 return (0); 5630 5631 goto privcheck; 5632 } 5633 5634 /* Otherwise, check the groups (first match) */ 5635 if (groupmember(file_gid, cred)) { 5636 if (file_mode & S_IXGRP) 5637 dac_granted |= VEXEC; 5638 if (file_mode & S_IRGRP) 5639 dac_granted |= VREAD; 5640 if (file_mode & S_IWGRP) 5641 dac_granted |= (VWRITE | VAPPEND); 5642 5643 if ((accmode & dac_granted) == accmode) 5644 return (0); 5645 5646 goto privcheck; 5647 } 5648 5649 /* Otherwise, check everyone else. */ 5650 if (file_mode & S_IXOTH) 5651 dac_granted |= VEXEC; 5652 if (file_mode & S_IROTH) 5653 dac_granted |= VREAD; 5654 if (file_mode & S_IWOTH) 5655 dac_granted |= (VWRITE | VAPPEND); 5656 if ((accmode & dac_granted) == accmode) 5657 return (0); 5658 5659 privcheck: 5660 /* 5661 * Build a privilege mask to determine if the set of privileges 5662 * satisfies the requirements when combined with the granted mask 5663 * from above. For each privilege, if the privilege is required, 5664 * bitwise or the request type onto the priv_granted mask. 5665 */ 5666 priv_granted = 0; 5667 5668 if (type == VDIR) { 5669 /* 5670 * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC 5671 * requests, instead of PRIV_VFS_EXEC. 5672 */ 5673 if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && 5674 !priv_check_cred(cred, PRIV_VFS_LOOKUP)) 5675 priv_granted |= VEXEC; 5676 } else { 5677 /* 5678 * Ensure that at least one execute bit is on. Otherwise, 5679 * a privileged user will always succeed, and we don't want 5680 * this to happen unless the file really is executable. 5681 */ 5682 if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && 5683 (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 && 5684 !priv_check_cred(cred, PRIV_VFS_EXEC)) 5685 priv_granted |= VEXEC; 5686 } 5687 5688 if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) && 5689 !priv_check_cred(cred, PRIV_VFS_READ)) 5690 priv_granted |= VREAD; 5691 5692 if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) && 5693 !priv_check_cred(cred, PRIV_VFS_WRITE)) 5694 priv_granted |= (VWRITE | VAPPEND); 5695 5696 if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) && 5697 !priv_check_cred(cred, PRIV_VFS_ADMIN)) 5698 priv_granted |= VADMIN; 5699 5700 if ((accmode & (priv_granted | dac_granted)) == accmode) { 5701 return (0); 5702 } 5703 5704 return ((accmode & VADMIN) ? EPERM : EACCES); 5705 } 5706 5707 /* 5708 * Credential check based on process requesting service, and per-attribute 5709 * permissions. 
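 *
 * For instance, a read of a user-namespace attribute would typically be
 * vetted with (sketch only):
 *
 *	error = extattr_check_cred(vp, EXTATTR_NAMESPACE_USER, cred, td, VREAD);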
5710 */ 5711 int 5712 extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred, 5713 struct thread *td, accmode_t accmode) 5714 { 5715 5716 /* 5717 * Kernel-invoked always succeeds. 5718 */ 5719 if (cred == NOCRED) 5720 return (0); 5721 5722 /* 5723 * Do not allow privileged processes in jail to directly manipulate 5724 * system attributes. 5725 */ 5726 switch (attrnamespace) { 5727 case EXTATTR_NAMESPACE_SYSTEM: 5728 /* Potentially should be: return (EPERM); */ 5729 return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM)); 5730 case EXTATTR_NAMESPACE_USER: 5731 return (VOP_ACCESS(vp, accmode, cred, td)); 5732 default: 5733 return (EPERM); 5734 } 5735 } 5736 5737 #ifdef DEBUG_VFS_LOCKS 5738 int vfs_badlock_ddb = 1; /* Drop into debugger on violation. */ 5739 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0, 5740 "Drop into debugger on lock violation"); 5741 5742 int vfs_badlock_mutex = 1; /* Check for interlock across VOPs. */ 5743 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex, 5744 0, "Check for interlock across VOPs"); 5745 5746 int vfs_badlock_print = 1; /* Print lock violations. */ 5747 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print, 5748 0, "Print lock violations"); 5749 5750 int vfs_badlock_vnode = 1; /* Print vnode details on lock violations. */ 5751 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_vnode, CTLFLAG_RW, &vfs_badlock_vnode, 5752 0, "Print vnode details on lock violations"); 5753 5754 #ifdef KDB 5755 int vfs_badlock_backtrace = 1; /* Print backtrace at lock violations. */ 5756 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW, 5757 &vfs_badlock_backtrace, 0, "Print backtrace at lock violations"); 5758 #endif 5759 5760 static void 5761 vfs_badlock(const char *msg, const char *str, struct vnode *vp) 5762 { 5763 5764 #ifdef KDB 5765 if (vfs_badlock_backtrace) 5766 kdb_backtrace(); 5767 #endif 5768 if (vfs_badlock_vnode) 5769 vn_printf(vp, "vnode "); 5770 if (vfs_badlock_print) 5771 printf("%s: %p %s\n", str, (void *)vp, msg); 5772 if (vfs_badlock_ddb) 5773 kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); 5774 } 5775 5776 void 5777 assert_vi_locked(struct vnode *vp, const char *str) 5778 { 5779 5780 if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp))) 5781 vfs_badlock("interlock is not locked but should be", str, vp); 5782 } 5783 5784 void 5785 assert_vi_unlocked(struct vnode *vp, const char *str) 5786 { 5787 5788 if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp))) 5789 vfs_badlock("interlock is locked but should not be", str, vp); 5790 } 5791 5792 void 5793 assert_vop_locked(struct vnode *vp, const char *str) 5794 { 5795 if (KERNEL_PANICKED() || vp == NULL) 5796 return; 5797 5798 #ifdef WITNESS 5799 if ((vp->v_irflag & VIRF_CROSSMP) == 0 && 5800 witness_is_owned(&vp->v_vnlock->lock_object) == -1) 5801 #else 5802 int locked = VOP_ISLOCKED(vp); 5803 if (locked == 0 || locked == LK_EXCLOTHER) 5804 #endif 5805 vfs_badlock("is not locked but should be", str, vp); 5806 } 5807 5808 void 5809 assert_vop_unlocked(struct vnode *vp, const char *str) 5810 { 5811 if (KERNEL_PANICKED() || vp == NULL) 5812 return; 5813 5814 #ifdef WITNESS 5815 if ((vp->v_irflag & VIRF_CROSSMP) == 0 && 5816 witness_is_owned(&vp->v_vnlock->lock_object) == 1) 5817 #else 5818 if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE) 5819 #endif 5820 vfs_badlock("is locked but should not be", str, vp); 5821 } 5822 5823 void 5824 assert_vop_elocked(struct vnode *vp, const char *str) 5825 { 5826 if (KERNEL_PANICKED() || vp == 
NULL) 5827 return; 5828 5829 if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) 5830 vfs_badlock("is not exclusive locked but should be", str, vp); 5831 } 5832 #endif /* DEBUG_VFS_LOCKS */ 5833 5834 void 5835 vop_rename_fail(struct vop_rename_args *ap) 5836 { 5837 5838 if (ap->a_tvp != NULL) 5839 vput(ap->a_tvp); 5840 if (ap->a_tdvp == ap->a_tvp) 5841 vrele(ap->a_tdvp); 5842 else 5843 vput(ap->a_tdvp); 5844 vrele(ap->a_fdvp); 5845 vrele(ap->a_fvp); 5846 } 5847 5848 void 5849 vop_rename_pre(void *ap) 5850 { 5851 struct vop_rename_args *a = ap; 5852 5853 #ifdef DEBUG_VFS_LOCKS 5854 if (a->a_tvp) 5855 ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME"); 5856 ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME"); 5857 ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME"); 5858 ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME"); 5859 5860 /* Check the source (from). */ 5861 if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock && 5862 (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock)) 5863 ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked"); 5864 if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock) 5865 ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked"); 5866 5867 /* Check the target. */ 5868 if (a->a_tvp) 5869 ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked"); 5870 ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked"); 5871 #endif 5872 /* 5873 * It may be tempting to add vn_seqc_write_begin/end calls here and 5874 * in vop_rename_post but that's not going to work out since some 5875 * filesystems relookup vnodes mid-rename. This is probably a bug. 5876 * 5877 * For now filesystems are expected to do the relevant calls after they 5878 * decide what vnodes to operate on. 5879 */ 5880 if (a->a_tdvp != a->a_fdvp) 5881 vhold(a->a_fdvp); 5882 if (a->a_tvp != a->a_fvp) 5883 vhold(a->a_fvp); 5884 vhold(a->a_tdvp); 5885 if (a->a_tvp) 5886 vhold(a->a_tvp); 5887 } 5888 5889 #ifdef DEBUG_VFS_LOCKS 5890 void 5891 vop_fplookup_vexec_debugpre(void *ap __unused) 5892 { 5893 5894 VFS_SMR_ASSERT_ENTERED(); 5895 } 5896 5897 void 5898 vop_fplookup_vexec_debugpost(void *ap, int rc) 5899 { 5900 struct vop_fplookup_vexec_args *a; 5901 struct vnode *vp; 5902 5903 a = ap; 5904 vp = a->a_vp; 5905 5906 VFS_SMR_ASSERT_ENTERED(); 5907 if (rc == EOPNOTSUPP) 5908 VNPASS(VN_IS_DOOMED(vp), vp); 5909 } 5910 5911 void 5912 vop_fplookup_symlink_debugpre(void *ap __unused) 5913 { 5914 5915 VFS_SMR_ASSERT_ENTERED(); 5916 } 5917 5918 void 5919 vop_fplookup_symlink_debugpost(void *ap __unused, int rc __unused) 5920 { 5921 5922 VFS_SMR_ASSERT_ENTERED(); 5923 } 5924 5925 static void 5926 vop_fsync_debugprepost(struct vnode *vp, const char *name) 5927 { 5928 if (vp->v_type == VCHR) 5929 ; 5930 /* 5931 * The shared vs. exclusive locking policy for fsync() 5932 * is actually determined by vp's write mount as indicated 5933 * by VOP_GETWRITEMOUNT(), which for stacked filesystems 5934 * may not be the same as vp->v_mount. However, if the 5935 * underlying filesystem which really handles the fsync() 5936 * supports shared locking, the stacked filesystem must also 5937 * be prepared for its VOP_FSYNC() operation to be called 5938 * with only a shared lock. On the other hand, if the 5939 * stacked filesystem claims support for shared write 5940 * locking but the underlying filesystem does not, and the 5941 * caller incorrectly uses a shared lock, this condition 5942 * should still be caught when the stacked filesystem 5943 * invokes VOP_FSYNC() on the underlying filesystem. 
5944 */ 5945 else if (MNT_SHARED_WRITES(vp->v_mount)) 5946 ASSERT_VOP_LOCKED(vp, name); 5947 else 5948 ASSERT_VOP_ELOCKED(vp, name); 5949 } 5950 5951 void 5952 vop_fsync_debugpre(void *a) 5953 { 5954 struct vop_fsync_args *ap; 5955 5956 ap = a; 5957 vop_fsync_debugprepost(ap->a_vp, "fsync"); 5958 } 5959 5960 void 5961 vop_fsync_debugpost(void *a, int rc __unused) 5962 { 5963 struct vop_fsync_args *ap; 5964 5965 ap = a; 5966 vop_fsync_debugprepost(ap->a_vp, "fsync"); 5967 } 5968 5969 void 5970 vop_fdatasync_debugpre(void *a) 5971 { 5972 struct vop_fdatasync_args *ap; 5973 5974 ap = a; 5975 vop_fsync_debugprepost(ap->a_vp, "fsync"); 5976 } 5977 5978 void 5979 vop_fdatasync_debugpost(void *a, int rc __unused) 5980 { 5981 struct vop_fdatasync_args *ap; 5982 5983 ap = a; 5984 vop_fsync_debugprepost(ap->a_vp, "fsync"); 5985 } 5986 5987 void 5988 vop_strategy_debugpre(void *ap) 5989 { 5990 struct vop_strategy_args *a; 5991 struct buf *bp; 5992 5993 a = ap; 5994 bp = a->a_bp; 5995 5996 /* 5997 * Cluster ops lock their component buffers but not the IO container. 5998 */ 5999 if ((bp->b_flags & B_CLUSTER) != 0) 6000 return; 6001 6002 if (!KERNEL_PANICKED() && !BUF_ISLOCKED(bp)) { 6003 if (vfs_badlock_print) 6004 printf( 6005 "VOP_STRATEGY: bp is not locked but should be\n"); 6006 if (vfs_badlock_ddb) 6007 kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); 6008 } 6009 } 6010 6011 void 6012 vop_lock_debugpre(void *ap) 6013 { 6014 struct vop_lock1_args *a = ap; 6015 6016 if ((a->a_flags & LK_INTERLOCK) == 0) 6017 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); 6018 else 6019 ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK"); 6020 } 6021 6022 void 6023 vop_lock_debugpost(void *ap, int rc) 6024 { 6025 struct vop_lock1_args *a = ap; 6026 6027 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); 6028 if (rc == 0 && (a->a_flags & LK_EXCLOTHER) == 0) 6029 ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK"); 6030 } 6031 6032 void 6033 vop_unlock_debugpre(void *ap) 6034 { 6035 struct vop_unlock_args *a = ap; 6036 struct vnode *vp = a->a_vp; 6037 6038 VNPASS(vn_get_state(vp) != VSTATE_UNINITIALIZED, vp); 6039 ASSERT_VOP_LOCKED(vp, "VOP_UNLOCK"); 6040 } 6041 6042 void 6043 vop_need_inactive_debugpre(void *ap) 6044 { 6045 struct vop_need_inactive_args *a = ap; 6046 6047 ASSERT_VI_LOCKED(a->a_vp, "VOP_NEED_INACTIVE"); 6048 } 6049 6050 void 6051 vop_need_inactive_debugpost(void *ap, int rc) 6052 { 6053 struct vop_need_inactive_args *a = ap; 6054 6055 ASSERT_VI_LOCKED(a->a_vp, "VOP_NEED_INACTIVE"); 6056 } 6057 #endif 6058 6059 void 6060 vop_create_pre(void *ap) 6061 { 6062 struct vop_create_args *a; 6063 struct vnode *dvp; 6064 6065 a = ap; 6066 dvp = a->a_dvp; 6067 vn_seqc_write_begin(dvp); 6068 } 6069 6070 void 6071 vop_create_post(void *ap, int rc) 6072 { 6073 struct vop_create_args *a; 6074 struct vnode *dvp; 6075 6076 a = ap; 6077 dvp = a->a_dvp; 6078 vn_seqc_write_end(dvp); 6079 if (!rc) 6080 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); 6081 } 6082 6083 void 6084 vop_whiteout_pre(void *ap) 6085 { 6086 struct vop_whiteout_args *a; 6087 struct vnode *dvp; 6088 6089 a = ap; 6090 dvp = a->a_dvp; 6091 vn_seqc_write_begin(dvp); 6092 } 6093 6094 void 6095 vop_whiteout_post(void *ap, int rc) 6096 { 6097 struct vop_whiteout_args *a; 6098 struct vnode *dvp; 6099 6100 a = ap; 6101 dvp = a->a_dvp; 6102 vn_seqc_write_end(dvp); 6103 } 6104 6105 void 6106 vop_deleteextattr_pre(void *ap) 6107 { 6108 struct vop_deleteextattr_args *a; 6109 struct vnode *vp; 6110 6111 a = ap; 6112 vp = a->a_vp; 6113 vn_seqc_write_begin(vp); 6114 } 6115 6116 void 6117 vop_deleteextattr_post(void *ap, int 
rc) 6118 { 6119 struct vop_deleteextattr_args *a; 6120 struct vnode *vp; 6121 6122 a = ap; 6123 vp = a->a_vp; 6124 vn_seqc_write_end(vp); 6125 if (!rc) 6126 VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); 6127 } 6128 6129 void 6130 vop_link_pre(void *ap) 6131 { 6132 struct vop_link_args *a; 6133 struct vnode *vp, *tdvp; 6134 6135 a = ap; 6136 vp = a->a_vp; 6137 tdvp = a->a_tdvp; 6138 vn_seqc_write_begin(vp); 6139 vn_seqc_write_begin(tdvp); 6140 } 6141 6142 void 6143 vop_link_post(void *ap, int rc) 6144 { 6145 struct vop_link_args *a; 6146 struct vnode *vp, *tdvp; 6147 6148 a = ap; 6149 vp = a->a_vp; 6150 tdvp = a->a_tdvp; 6151 vn_seqc_write_end(vp); 6152 vn_seqc_write_end(tdvp); 6153 if (!rc) { 6154 VFS_KNOTE_LOCKED(vp, NOTE_LINK); 6155 VFS_KNOTE_LOCKED(tdvp, NOTE_WRITE); 6156 } 6157 } 6158 6159 void 6160 vop_mkdir_pre(void *ap) 6161 { 6162 struct vop_mkdir_args *a; 6163 struct vnode *dvp; 6164 6165 a = ap; 6166 dvp = a->a_dvp; 6167 vn_seqc_write_begin(dvp); 6168 } 6169 6170 void 6171 vop_mkdir_post(void *ap, int rc) 6172 { 6173 struct vop_mkdir_args *a; 6174 struct vnode *dvp; 6175 6176 a = ap; 6177 dvp = a->a_dvp; 6178 vn_seqc_write_end(dvp); 6179 if (!rc) 6180 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK); 6181 } 6182 6183 #ifdef DEBUG_VFS_LOCKS 6184 void 6185 vop_mkdir_debugpost(void *ap, int rc) 6186 { 6187 struct vop_mkdir_args *a; 6188 6189 a = ap; 6190 if (!rc) 6191 cache_validate(a->a_dvp, *a->a_vpp, a->a_cnp); 6192 } 6193 #endif 6194 6195 void 6196 vop_mknod_pre(void *ap) 6197 { 6198 struct vop_mknod_args *a; 6199 struct vnode *dvp; 6200 6201 a = ap; 6202 dvp = a->a_dvp; 6203 vn_seqc_write_begin(dvp); 6204 } 6205 6206 void 6207 vop_mknod_post(void *ap, int rc) 6208 { 6209 struct vop_mknod_args *a; 6210 struct vnode *dvp; 6211 6212 a = ap; 6213 dvp = a->a_dvp; 6214 vn_seqc_write_end(dvp); 6215 if (!rc) 6216 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); 6217 } 6218 6219 void 6220 vop_reclaim_post(void *ap, int rc) 6221 { 6222 struct vop_reclaim_args *a; 6223 struct vnode *vp; 6224 6225 a = ap; 6226 vp = a->a_vp; 6227 ASSERT_VOP_IN_SEQC(vp); 6228 if (!rc) 6229 VFS_KNOTE_LOCKED(vp, NOTE_REVOKE); 6230 } 6231 6232 void 6233 vop_remove_pre(void *ap) 6234 { 6235 struct vop_remove_args *a; 6236 struct vnode *dvp, *vp; 6237 6238 a = ap; 6239 dvp = a->a_dvp; 6240 vp = a->a_vp; 6241 vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK); 6242 vn_seqc_write_begin(dvp); 6243 vn_seqc_write_begin(vp); 6244 } 6245 6246 void 6247 vop_remove_post(void *ap, int rc) 6248 { 6249 struct vop_remove_args *a; 6250 struct vnode *dvp, *vp; 6251 6252 a = ap; 6253 dvp = a->a_dvp; 6254 vp = a->a_vp; 6255 vn_seqc_write_end(dvp); 6256 vn_seqc_write_end(vp); 6257 if (!rc) { 6258 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); 6259 VFS_KNOTE_LOCKED(vp, NOTE_DELETE); 6260 } 6261 } 6262 6263 void 6264 vop_rename_post(void *ap, int rc) 6265 { 6266 struct vop_rename_args *a = ap; 6267 long hint; 6268 6269 if (!rc) { 6270 hint = NOTE_WRITE; 6271 if (a->a_fdvp == a->a_tdvp) { 6272 if (a->a_tvp != NULL && a->a_tvp->v_type == VDIR) 6273 hint |= NOTE_LINK; 6274 VFS_KNOTE_UNLOCKED(a->a_fdvp, hint); 6275 VFS_KNOTE_UNLOCKED(a->a_tdvp, hint); 6276 } else { 6277 hint |= NOTE_EXTEND; 6278 if (a->a_fvp->v_type == VDIR) 6279 hint |= NOTE_LINK; 6280 VFS_KNOTE_UNLOCKED(a->a_fdvp, hint); 6281 6282 if (a->a_fvp->v_type == VDIR && a->a_tvp != NULL && 6283 a->a_tvp->v_type == VDIR) 6284 hint &= ~NOTE_LINK; 6285 VFS_KNOTE_UNLOCKED(a->a_tdvp, hint); 6286 } 6287 6288 VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME); 6289 if (a->a_tvp) 6290 VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE); 6291 
} 6292 if (a->a_tdvp != a->a_fdvp) 6293 vdrop(a->a_fdvp); 6294 if (a->a_tvp != a->a_fvp) 6295 vdrop(a->a_fvp); 6296 vdrop(a->a_tdvp); 6297 if (a->a_tvp) 6298 vdrop(a->a_tvp); 6299 } 6300 6301 void 6302 vop_rmdir_pre(void *ap) 6303 { 6304 struct vop_rmdir_args *a; 6305 struct vnode *dvp, *vp; 6306 6307 a = ap; 6308 dvp = a->a_dvp; 6309 vp = a->a_vp; 6310 vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK); 6311 vn_seqc_write_begin(dvp); 6312 vn_seqc_write_begin(vp); 6313 } 6314 6315 void 6316 vop_rmdir_post(void *ap, int rc) 6317 { 6318 struct vop_rmdir_args *a; 6319 struct vnode *dvp, *vp; 6320 6321 a = ap; 6322 dvp = a->a_dvp; 6323 vp = a->a_vp; 6324 vn_seqc_write_end(dvp); 6325 vn_seqc_write_end(vp); 6326 if (!rc) { 6327 vp->v_vflag |= VV_UNLINKED; 6328 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK); 6329 VFS_KNOTE_LOCKED(vp, NOTE_DELETE); 6330 } 6331 } 6332 6333 void 6334 vop_setattr_pre(void *ap) 6335 { 6336 struct vop_setattr_args *a; 6337 struct vnode *vp; 6338 6339 a = ap; 6340 vp = a->a_vp; 6341 vn_seqc_write_begin(vp); 6342 } 6343 6344 void 6345 vop_setattr_post(void *ap, int rc) 6346 { 6347 struct vop_setattr_args *a; 6348 struct vnode *vp; 6349 6350 a = ap; 6351 vp = a->a_vp; 6352 vn_seqc_write_end(vp); 6353 if (!rc) 6354 VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB); 6355 } 6356 6357 void 6358 vop_setacl_pre(void *ap) 6359 { 6360 struct vop_setacl_args *a; 6361 struct vnode *vp; 6362 6363 a = ap; 6364 vp = a->a_vp; 6365 vn_seqc_write_begin(vp); 6366 } 6367 6368 void 6369 vop_setacl_post(void *ap, int rc __unused) 6370 { 6371 struct vop_setacl_args *a; 6372 struct vnode *vp; 6373 6374 a = ap; 6375 vp = a->a_vp; 6376 vn_seqc_write_end(vp); 6377 } 6378 6379 void 6380 vop_setextattr_pre(void *ap) 6381 { 6382 struct vop_setextattr_args *a; 6383 struct vnode *vp; 6384 6385 a = ap; 6386 vp = a->a_vp; 6387 vn_seqc_write_begin(vp); 6388 } 6389 6390 void 6391 vop_setextattr_post(void *ap, int rc) 6392 { 6393 struct vop_setextattr_args *a; 6394 struct vnode *vp; 6395 6396 a = ap; 6397 vp = a->a_vp; 6398 vn_seqc_write_end(vp); 6399 if (!rc) 6400 VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB); 6401 } 6402 6403 void 6404 vop_symlink_pre(void *ap) 6405 { 6406 struct vop_symlink_args *a; 6407 struct vnode *dvp; 6408 6409 a = ap; 6410 dvp = a->a_dvp; 6411 vn_seqc_write_begin(dvp); 6412 } 6413 6414 void 6415 vop_symlink_post(void *ap, int rc) 6416 { 6417 struct vop_symlink_args *a; 6418 struct vnode *dvp; 6419 6420 a = ap; 6421 dvp = a->a_dvp; 6422 vn_seqc_write_end(dvp); 6423 if (!rc) 6424 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); 6425 } 6426 6427 void 6428 vop_open_post(void *ap, int rc) 6429 { 6430 struct vop_open_args *a = ap; 6431 6432 if (!rc) 6433 VFS_KNOTE_LOCKED(a->a_vp, NOTE_OPEN); 6434 } 6435 6436 void 6437 vop_close_post(void *ap, int rc) 6438 { 6439 struct vop_close_args *a = ap; 6440 6441 if (!rc && (a->a_cred != NOCRED || /* filter out revokes */ 6442 !VN_IS_DOOMED(a->a_vp))) { 6443 VFS_KNOTE_LOCKED(a->a_vp, (a->a_fflag & FWRITE) != 0 ? 
6444 NOTE_CLOSE_WRITE : NOTE_CLOSE); 6445 } 6446 } 6447 6448 void 6449 vop_read_post(void *ap, int rc) 6450 { 6451 struct vop_read_args *a = ap; 6452 6453 if (!rc) 6454 VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ); 6455 } 6456 6457 void 6458 vop_read_pgcache_post(void *ap, int rc) 6459 { 6460 struct vop_read_pgcache_args *a = ap; 6461 6462 if (!rc) 6463 VFS_KNOTE_UNLOCKED(a->a_vp, NOTE_READ); 6464 } 6465 6466 void 6467 vop_readdir_post(void *ap, int rc) 6468 { 6469 struct vop_readdir_args *a = ap; 6470 6471 if (!rc) 6472 VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ); 6473 } 6474 6475 static struct knlist fs_knlist; 6476 6477 static void 6478 vfs_event_init(void *arg) 6479 { 6480 knlist_init_mtx(&fs_knlist, NULL); 6481 } 6482 /* XXX - correct order? */ 6483 SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL); 6484 6485 void 6486 vfs_event_signal(fsid_t *fsid, uint32_t event, intptr_t data __unused) 6487 { 6488 6489 KNOTE_UNLOCKED(&fs_knlist, event); 6490 } 6491 6492 static int filt_fsattach(struct knote *kn); 6493 static void filt_fsdetach(struct knote *kn); 6494 static int filt_fsevent(struct knote *kn, long hint); 6495 6496 const struct filterops fs_filtops = { 6497 .f_isfd = 0, 6498 .f_attach = filt_fsattach, 6499 .f_detach = filt_fsdetach, 6500 .f_event = filt_fsevent, 6501 }; 6502 6503 static int 6504 filt_fsattach(struct knote *kn) 6505 { 6506 6507 kn->kn_flags |= EV_CLEAR; 6508 knlist_add(&fs_knlist, kn, 0); 6509 return (0); 6510 } 6511 6512 static void 6513 filt_fsdetach(struct knote *kn) 6514 { 6515 6516 knlist_remove(&fs_knlist, kn, 0); 6517 } 6518 6519 static int 6520 filt_fsevent(struct knote *kn, long hint) 6521 { 6522 6523 kn->kn_fflags |= kn->kn_sfflags & hint; 6524 6525 return (kn->kn_fflags != 0); 6526 } 6527 6528 static int 6529 sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS) 6530 { 6531 struct vfsidctl vc; 6532 int error; 6533 struct mount *mp; 6534 6535 if (req->newptr == NULL) 6536 return (EINVAL); 6537 error = SYSCTL_IN(req, &vc, sizeof(vc)); 6538 if (error) 6539 return (error); 6540 if (vc.vc_vers != VFS_CTL_VERS1) 6541 return (EINVAL); 6542 mp = vfs_getvfs(&vc.vc_fsid); 6543 if (mp == NULL) 6544 return (ENOENT); 6545 /* ensure that a specific sysctl goes to the right filesystem. */ 6546 if (strcmp(vc.vc_fstypename, "*") != 0 && 6547 strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) { 6548 vfs_rel(mp); 6549 return (EINVAL); 6550 } 6551 VCTLTOREQ(&vc, req); 6552 error = VFS_SYSCTL(mp, vc.vc_op, req); 6553 vfs_rel(mp); 6554 return (error); 6555 } 6556 6557 SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE | CTLFLAG_MPSAFE | CTLFLAG_WR, 6558 NULL, 0, sysctl_vfs_ctl, "", 6559 "Sysctl by fsid"); 6560 6561 /* 6562 * Function to initialize a va_filerev field sensibly. 6563 * XXX: Wouldn't a random number make a lot more sense ?? 
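 *
 * As implemented below, the upper 32 bits hold the boot-relative
 * seconds and the lower 32 bits hold the top half of the binary
 * fraction, so successive calls within one boot yield non-decreasing
 * values.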
6564 */ 6565 u_quad_t 6566 init_va_filerev(void) 6567 { 6568 struct bintime bt; 6569 6570 getbinuptime(&bt); 6571 return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL)); 6572 } 6573 6574 static int filt_vfsread(struct knote *kn, long hint); 6575 static int filt_vfswrite(struct knote *kn, long hint); 6576 static int filt_vfsvnode(struct knote *kn, long hint); 6577 static void filt_vfsdetach(struct knote *kn); 6578 static int filt_vfsdump(struct proc *p, struct knote *kn, 6579 struct kinfo_knote *kin); 6580 6581 static const struct filterops vfsread_filtops = { 6582 .f_isfd = 1, 6583 .f_detach = filt_vfsdetach, 6584 .f_event = filt_vfsread, 6585 .f_userdump = filt_vfsdump, 6586 }; 6587 static const struct filterops vfswrite_filtops = { 6588 .f_isfd = 1, 6589 .f_detach = filt_vfsdetach, 6590 .f_event = filt_vfswrite, 6591 .f_userdump = filt_vfsdump, 6592 }; 6593 static const struct filterops vfsvnode_filtops = { 6594 .f_isfd = 1, 6595 .f_detach = filt_vfsdetach, 6596 .f_event = filt_vfsvnode, 6597 .f_userdump = filt_vfsdump, 6598 }; 6599 6600 static void 6601 vfs_knllock(void *arg) 6602 { 6603 struct vnode *vp = arg; 6604 6605 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 6606 } 6607 6608 static void 6609 vfs_knlunlock(void *arg) 6610 { 6611 struct vnode *vp = arg; 6612 6613 VOP_UNLOCK(vp); 6614 } 6615 6616 static void 6617 vfs_knl_assert_lock(void *arg, int what) 6618 { 6619 #ifdef DEBUG_VFS_LOCKS 6620 struct vnode *vp = arg; 6621 6622 if (what == LA_LOCKED) 6623 ASSERT_VOP_LOCKED(vp, "vfs_knl_assert_locked"); 6624 else 6625 ASSERT_VOP_UNLOCKED(vp, "vfs_knl_assert_unlocked"); 6626 #endif 6627 } 6628 6629 int 6630 vfs_kqfilter(struct vop_kqfilter_args *ap) 6631 { 6632 struct vnode *vp = ap->a_vp; 6633 struct knote *kn = ap->a_kn; 6634 struct knlist *knl; 6635 6636 KASSERT(vp->v_type != VFIFO || (kn->kn_filter != EVFILT_READ && 6637 kn->kn_filter != EVFILT_WRITE), 6638 ("READ/WRITE filter on a FIFO leaked through")); 6639 switch (kn->kn_filter) { 6640 case EVFILT_READ: 6641 kn->kn_fop = &vfsread_filtops; 6642 break; 6643 case EVFILT_WRITE: 6644 kn->kn_fop = &vfswrite_filtops; 6645 break; 6646 case EVFILT_VNODE: 6647 kn->kn_fop = &vfsvnode_filtops; 6648 break; 6649 default: 6650 return (EINVAL); 6651 } 6652 6653 kn->kn_hook = (caddr_t)vp; 6654 6655 v_addpollinfo(vp); 6656 if (vp->v_pollinfo == NULL) 6657 return (ENOMEM); 6658 knl = &vp->v_pollinfo->vpi_selinfo.si_note; 6659 vhold(vp); 6660 knlist_add(knl, kn, 0); 6661 6662 return (0); 6663 } 6664 6665 /* 6666 * Detach knote from vnode 6667 */ 6668 static void 6669 filt_vfsdetach(struct knote *kn) 6670 { 6671 struct vnode *vp = (struct vnode *)kn->kn_hook; 6672 6673 KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo")); 6674 knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0); 6675 vdrop(vp); 6676 } 6677 6678 /*ARGSUSED*/ 6679 static int 6680 filt_vfsread(struct knote *kn, long hint) 6681 { 6682 struct vnode *vp = (struct vnode *)kn->kn_hook; 6683 off_t size; 6684 int res; 6685 6686 /* 6687 * filesystem is gone, so set the EOF flag and schedule 6688 * the knote for deletion. 
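 * (NOTE_REVOKE is posted from vop_reclaim_post() above when a vnode is
 * reclaimed; a vnode that was already reclaimed before this filter ran
 * is seen here as VBAD with a hint of 0.)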
6689 */ 6690 if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) { 6691 VI_LOCK(vp); 6692 kn->kn_flags |= (EV_EOF | EV_ONESHOT); 6693 VI_UNLOCK(vp); 6694 return (1); 6695 } 6696 6697 if (vn_getsize_locked(vp, &size, curthread->td_ucred) != 0) 6698 return (0); 6699 6700 VI_LOCK(vp); 6701 kn->kn_data = size - kn->kn_fp->f_offset; 6702 res = (kn->kn_sfflags & NOTE_FILE_POLL) != 0 || kn->kn_data != 0; 6703 VI_UNLOCK(vp); 6704 return (res); 6705 } 6706 6707 /*ARGSUSED*/ 6708 static int 6709 filt_vfswrite(struct knote *kn, long hint) 6710 { 6711 struct vnode *vp = (struct vnode *)kn->kn_hook; 6712 6713 VI_LOCK(vp); 6714 6715 /* 6716 * filesystem is gone, so set the EOF flag and schedule 6717 * the knote for deletion. 6718 */ 6719 if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) 6720 kn->kn_flags |= (EV_EOF | EV_ONESHOT); 6721 6722 kn->kn_data = 0; 6723 VI_UNLOCK(vp); 6724 return (1); 6725 } 6726 6727 static int 6728 filt_vfsvnode(struct knote *kn, long hint) 6729 { 6730 struct vnode *vp = (struct vnode *)kn->kn_hook; 6731 int res; 6732 6733 VI_LOCK(vp); 6734 if (kn->kn_sfflags & hint) 6735 kn->kn_fflags |= hint; 6736 if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) { 6737 kn->kn_flags |= EV_EOF; 6738 VI_UNLOCK(vp); 6739 return (1); 6740 } 6741 res = (kn->kn_fflags != 0); 6742 VI_UNLOCK(vp); 6743 return (res); 6744 } 6745 6746 static int 6747 filt_vfsdump(struct proc *p, struct knote *kn, struct kinfo_knote *kin) 6748 { 6749 struct vattr va; 6750 struct vnode *vp; 6751 char *fullpath, *freepath; 6752 int error; 6753 6754 kin->knt_extdata = KNOTE_EXTDATA_VNODE; 6755 6756 vp = kn->kn_fp->f_vnode; 6757 kin->knt_vnode.knt_vnode_type = vntype_to_kinfo(vp->v_type); 6758 6759 va.va_fsid = VNOVAL; 6760 vn_lock(vp, LK_SHARED | LK_RETRY); 6761 error = VOP_GETATTR(vp, &va, curthread->td_ucred); 6762 VOP_UNLOCK(vp); 6763 if (error != 0) 6764 return (error); 6765 kin->knt_vnode.knt_vnode_fsid = va.va_fsid; 6766 kin->knt_vnode.knt_vnode_fileid = va.va_fileid; 6767 6768 freepath = NULL; 6769 fullpath = "-"; 6770 error = vn_fullpath(vp, &fullpath, &freepath); 6771 if (error == 0) { 6772 strlcpy(kin->knt_vnode.knt_vnode_fullpath, fullpath, 6773 sizeof(kin->knt_vnode.knt_vnode_fullpath)); 6774 } 6775 if (freepath != NULL) 6776 free(freepath, M_TEMP); 6777 6778 return (0); 6779 } 6780 6781 int 6782 vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off) 6783 { 6784 int error; 6785 6786 if (dp->d_reclen > ap->a_uio->uio_resid) 6787 return (ENAMETOOLONG); 6788 error = uiomove(dp, dp->d_reclen, ap->a_uio); 6789 if (error) { 6790 if (ap->a_ncookies != NULL) { 6791 if (ap->a_cookies != NULL) 6792 free(ap->a_cookies, M_TEMP); 6793 ap->a_cookies = NULL; 6794 *ap->a_ncookies = 0; 6795 } 6796 return (error); 6797 } 6798 if (ap->a_ncookies == NULL) 6799 return (0); 6800 6801 KASSERT(ap->a_cookies, 6802 ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!")); 6803 6804 *ap->a_cookies = realloc(*ap->a_cookies, 6805 (*ap->a_ncookies + 1) * sizeof(uint64_t), M_TEMP, M_WAITOK | M_ZERO); 6806 (*ap->a_cookies)[*ap->a_ncookies] = off; 6807 *ap->a_ncookies += 1; 6808 return (0); 6809 } 6810 6811 /* 6812 * The purpose of this routine is to remove granularity from accmode_t, 6813 * reducing it into standard unix access bits - VEXEC, VREAD, VWRITE, 6814 * VADMIN and VAPPEND. 6815 * 6816 * If it returns 0, the caller is supposed to continue with the usual 6817 * access checks using 'accmode' as modified by this routine. 
If it 6818 * returns nonzero value, the caller is supposed to return that value 6819 * as errno. 6820 * 6821 * Note that after this routine runs, accmode may be zero. 6822 */ 6823 int 6824 vfs_unixify_accmode(accmode_t *accmode) 6825 { 6826 /* 6827 * There is no way to specify explicit "deny" rule using 6828 * file mode or POSIX.1e ACLs. 6829 */ 6830 if (*accmode & VEXPLICIT_DENY) { 6831 *accmode = 0; 6832 return (0); 6833 } 6834 6835 /* 6836 * None of these can be translated into usual access bits. 6837 * Also, the common case for NFSv4 ACLs is to not contain 6838 * either of these bits. Caller should check for VWRITE 6839 * on the containing directory instead. 6840 */ 6841 if (*accmode & (VDELETE_CHILD | VDELETE)) 6842 return (EPERM); 6843 6844 if (*accmode & VADMIN_PERMS) { 6845 *accmode &= ~VADMIN_PERMS; 6846 *accmode |= VADMIN; 6847 } 6848 6849 /* 6850 * There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL 6851 * or VSYNCHRONIZE using file mode or POSIX.1e ACL. 6852 */ 6853 *accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE); 6854 6855 return (0); 6856 } 6857 6858 /* 6859 * Clear out a doomed vnode (if any) and replace it with a new one as long 6860 * as the fs is not being unmounted. Return the root vnode to the caller. 6861 */ 6862 static int __noinline 6863 vfs_cache_root_fallback(struct mount *mp, int flags, struct vnode **vpp) 6864 { 6865 struct vnode *vp; 6866 int error; 6867 6868 restart: 6869 if (mp->mnt_rootvnode != NULL) { 6870 MNT_ILOCK(mp); 6871 vp = mp->mnt_rootvnode; 6872 if (vp != NULL) { 6873 if (!VN_IS_DOOMED(vp)) { 6874 vrefact(vp); 6875 MNT_IUNLOCK(mp); 6876 error = vn_lock(vp, flags); 6877 if (error == 0) { 6878 *vpp = vp; 6879 return (0); 6880 } 6881 vrele(vp); 6882 goto restart; 6883 } 6884 /* 6885 * Clear the old one. 6886 */ 6887 mp->mnt_rootvnode = NULL; 6888 } 6889 MNT_IUNLOCK(mp); 6890 if (vp != NULL) { 6891 vfs_op_barrier_wait(mp); 6892 vrele(vp); 6893 } 6894 } 6895 error = VFS_CACHEDROOT(mp, flags, vpp); 6896 if (error != 0) 6897 return (error); 6898 if (mp->mnt_vfs_ops == 0) { 6899 MNT_ILOCK(mp); 6900 if (mp->mnt_vfs_ops != 0) { 6901 MNT_IUNLOCK(mp); 6902 return (0); 6903 } 6904 if (mp->mnt_rootvnode == NULL) { 6905 vrefact(*vpp); 6906 mp->mnt_rootvnode = *vpp; 6907 } else { 6908 if (mp->mnt_rootvnode != *vpp) { 6909 if (!VN_IS_DOOMED(mp->mnt_rootvnode)) { 6910 panic("%s: mismatch between vnode returned " 6911 " by VFS_CACHEDROOT and the one cached " 6912 " (%p != %p)", 6913 __func__, *vpp, mp->mnt_rootvnode); 6914 } 6915 } 6916 } 6917 MNT_IUNLOCK(mp); 6918 } 6919 return (0); 6920 } 6921 6922 int 6923 vfs_cache_root(struct mount *mp, int flags, struct vnode **vpp) 6924 { 6925 struct mount_pcpu *mpcpu; 6926 struct vnode *vp; 6927 int error; 6928 6929 if (!vfs_op_thread_enter(mp, mpcpu)) 6930 return (vfs_cache_root_fallback(mp, flags, vpp)); 6931 vp = atomic_load_ptr(&mp->mnt_rootvnode); 6932 if (vp == NULL || VN_IS_DOOMED(vp)) { 6933 vfs_op_thread_exit(mp, mpcpu); 6934 return (vfs_cache_root_fallback(mp, flags, vpp)); 6935 } 6936 vrefact(vp); 6937 vfs_op_thread_exit(mp, mpcpu); 6938 error = vn_lock(vp, flags); 6939 if (error != 0) { 6940 vrele(vp); 6941 return (vfs_cache_root_fallback(mp, flags, vpp)); 6942 } 6943 *vpp = vp; 6944 return (0); 6945 } 6946 6947 struct vnode * 6948 vfs_cache_root_clear(struct mount *mp) 6949 { 6950 struct vnode *vp; 6951 6952 /* 6953 * ops > 0 guarantees there is nobody who can see this vnode 6954 */ 6955 MPASS(mp->mnt_vfs_ops > 0); 6956 vp = mp->mnt_rootvnode; 6957 if (vp != NULL) 6958 vn_seqc_write_begin(vp); 6959 mp->mnt_rootvnode = 
NULL; 6960 return (vp); 6961 } 6962 6963 void 6964 vfs_cache_root_set(struct mount *mp, struct vnode *vp) 6965 { 6966 6967 MPASS(mp->mnt_vfs_ops > 0); 6968 vrefact(vp); 6969 mp->mnt_rootvnode = vp; 6970 } 6971 6972 /* 6973 * These are helper functions for filesystems to traverse all 6974 * their vnodes. See MNT_VNODE_FOREACH_ALL() in sys/mount.h. 6975 * 6976 * This interface replaces MNT_VNODE_FOREACH. 6977 */ 6978 6979 struct vnode * 6980 __mnt_vnode_next_all(struct vnode **mvp, struct mount *mp) 6981 { 6982 struct vnode *vp; 6983 6984 maybe_yield(); 6985 MNT_ILOCK(mp); 6986 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); 6987 for (vp = TAILQ_NEXT(*mvp, v_nmntvnodes); vp != NULL; 6988 vp = TAILQ_NEXT(vp, v_nmntvnodes)) { 6989 /* Allow a racy peek at VIRF_DOOMED to save a lock acquisition. */ 6990 if (vp->v_type == VMARKER || VN_IS_DOOMED(vp)) 6991 continue; 6992 VI_LOCK(vp); 6993 if (VN_IS_DOOMED(vp)) { 6994 VI_UNLOCK(vp); 6995 continue; 6996 } 6997 break; 6998 } 6999 if (vp == NULL) { 7000 __mnt_vnode_markerfree_all(mvp, mp); 7001 /* MNT_IUNLOCK(mp); -- done in above function */ 7002 mtx_assert(MNT_MTX(mp), MA_NOTOWNED); 7003 return (NULL); 7004 } 7005 TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); 7006 TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); 7007 MNT_IUNLOCK(mp); 7008 return (vp); 7009 } 7010 7011 struct vnode * 7012 __mnt_vnode_first_all(struct vnode **mvp, struct mount *mp) 7013 { 7014 struct vnode *vp; 7015 7016 *mvp = vn_alloc_marker(mp); 7017 MNT_ILOCK(mp); 7018 MNT_REF(mp); 7019 7020 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 7021 /* Allow a racy peek at VIRF_DOOMED to save a lock acquisition. */ 7022 if (vp->v_type == VMARKER || VN_IS_DOOMED(vp)) 7023 continue; 7024 VI_LOCK(vp); 7025 if (VN_IS_DOOMED(vp)) { 7026 VI_UNLOCK(vp); 7027 continue; 7028 } 7029 break; 7030 } 7031 if (vp == NULL) { 7032 MNT_REL(mp); 7033 MNT_IUNLOCK(mp); 7034 vn_free_marker(*mvp); 7035 *mvp = NULL; 7036 return (NULL); 7037 } 7038 TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); 7039 MNT_IUNLOCK(mp); 7040 return (vp); 7041 } 7042 7043 void 7044 __mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp) 7045 { 7046 7047 if (*mvp == NULL) { 7048 MNT_IUNLOCK(mp); 7049 return; 7050 } 7051 7052 mtx_assert(MNT_MTX(mp), MA_OWNED); 7053 7054 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); 7055 TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); 7056 MNT_REL(mp); 7057 MNT_IUNLOCK(mp); 7058 vn_free_marker(*mvp); 7059 *mvp = NULL; 7060 } 7061 7062 /* 7063 * These are helper functions for filesystems to traverse their 7064 * lazy vnodes. See MNT_VNODE_FOREACH_LAZY() in sys/mount.h 7065 */ 7066 static void 7067 mnt_vnode_markerfree_lazy(struct vnode **mvp, struct mount *mp) 7068 { 7069 7070 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); 7071 7072 MNT_ILOCK(mp); 7073 MNT_REL(mp); 7074 MNT_IUNLOCK(mp); 7075 vn_free_marker(*mvp); 7076 *mvp = NULL; 7077 } 7078 7079 /* 7080 * Relock the mp mount vnode list lock with the vp vnode interlock in the 7081 * conventional lock order during mnt_vnode_next_lazy iteration. 7082 * 7083 * On entry, the mount vnode list lock is held and the vnode interlock is not. 7084 * The list lock is dropped and reacquired. On success, both locks are held. 7085 * On failure, the mount vnode list lock is held but the vnode interlock is 7086 * not, and the procedure may have yielded. 
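 *
 * In outline, the code below performs roughly the following dance:
 *	vhold(vp)			pin vp so it cannot be freed
 *	mtx_unlock(&mp->mnt_listmtx)	drop the out-of-order lock
 *	VI_LOCK(vp)			take the interlock first, as usual
 *	recheck vp and release the extra hold reference
 *	mtx_lock(&mp->mnt_listmtx)	reacquire the list lock last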
7087 */ 7088 static bool 7089 mnt_vnode_next_lazy_relock(struct vnode *mvp, struct mount *mp, 7090 struct vnode *vp) 7091 { 7092 7093 VNASSERT(mvp->v_mount == mp && mvp->v_type == VMARKER && 7094 TAILQ_NEXT(mvp, v_lazylist) != NULL, mvp, 7095 ("%s: bad marker", __func__)); 7096 VNASSERT(vp->v_mount == mp && vp->v_type != VMARKER, vp, 7097 ("%s: inappropriate vnode", __func__)); 7098 ASSERT_VI_UNLOCKED(vp, __func__); 7099 mtx_assert(&mp->mnt_listmtx, MA_OWNED); 7100 7101 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, mvp, v_lazylist); 7102 TAILQ_INSERT_BEFORE(vp, mvp, v_lazylist); 7103 7104 /* 7105 * Note we may be racing against vdrop which transitioned the hold 7106 * count to 0 and now waits for the ->mnt_listmtx lock. This is fine, 7107 * if we are the only user after we get the interlock we will just 7108 * vdrop. 7109 */ 7110 vhold(vp); 7111 mtx_unlock(&mp->mnt_listmtx); 7112 VI_LOCK(vp); 7113 if (VN_IS_DOOMED(vp)) { 7114 VNPASS((vp->v_mflag & VMP_LAZYLIST) == 0, vp); 7115 goto out_lost; 7116 } 7117 VNPASS(vp->v_mflag & VMP_LAZYLIST, vp); 7118 /* 7119 * There is nothing to do if we are the last user. 7120 */ 7121 if (!refcount_release_if_not_last(&vp->v_holdcnt)) 7122 goto out_lost; 7123 mtx_lock(&mp->mnt_listmtx); 7124 return (true); 7125 out_lost: 7126 vdropl(vp); 7127 maybe_yield(); 7128 mtx_lock(&mp->mnt_listmtx); 7129 return (false); 7130 } 7131 7132 static struct vnode * 7133 mnt_vnode_next_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb, 7134 void *cbarg) 7135 { 7136 struct vnode *vp; 7137 7138 mtx_assert(&mp->mnt_listmtx, MA_OWNED); 7139 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); 7140 restart: 7141 vp = TAILQ_NEXT(*mvp, v_lazylist); 7142 while (vp != NULL) { 7143 if (vp->v_type == VMARKER) { 7144 vp = TAILQ_NEXT(vp, v_lazylist); 7145 continue; 7146 } 7147 /* 7148 * See if we want to process the vnode. Note we may encounter a 7149 * long string of vnodes we don't care about and hog the list 7150 * as a result. Check for it and requeue the marker. 7151 */ 7152 VNPASS(!VN_IS_DOOMED(vp), vp); 7153 if (!cb(vp, cbarg)) { 7154 if (!should_yield()) { 7155 vp = TAILQ_NEXT(vp, v_lazylist); 7156 continue; 7157 } 7158 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, 7159 v_lazylist); 7160 TAILQ_INSERT_AFTER(&mp->mnt_lazyvnodelist, vp, *mvp, 7161 v_lazylist); 7162 mtx_unlock(&mp->mnt_listmtx); 7163 kern_yield(PRI_USER); 7164 mtx_lock(&mp->mnt_listmtx); 7165 goto restart; 7166 } 7167 /* 7168 * Try-lock because this is the wrong lock order. 
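 * The conventional order is vnode interlock before the vnode list
 * lock; if the try-lock fails, mnt_vnode_next_lazy_relock() above is
 * used to reacquire both in that order.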
7169 */ 7170 if (!VI_TRYLOCK(vp) && 7171 !mnt_vnode_next_lazy_relock(*mvp, mp, vp)) 7172 goto restart; 7173 KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp)); 7174 KASSERT(vp->v_mount == mp || vp->v_mount == NULL, 7175 ("alien vnode on the lazy list %p %p", vp, mp)); 7176 VNPASS(vp->v_mount == mp, vp); 7177 VNPASS(!VN_IS_DOOMED(vp), vp); 7178 break; 7179 } 7180 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, v_lazylist); 7181 7182 /* Check if we are done */ 7183 if (vp == NULL) { 7184 mtx_unlock(&mp->mnt_listmtx); 7185 mnt_vnode_markerfree_lazy(mvp, mp); 7186 return (NULL); 7187 } 7188 TAILQ_INSERT_AFTER(&mp->mnt_lazyvnodelist, vp, *mvp, v_lazylist); 7189 mtx_unlock(&mp->mnt_listmtx); 7190 ASSERT_VI_LOCKED(vp, "lazy iter"); 7191 return (vp); 7192 } 7193 7194 struct vnode * 7195 __mnt_vnode_next_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb, 7196 void *cbarg) 7197 { 7198 7199 maybe_yield(); 7200 mtx_lock(&mp->mnt_listmtx); 7201 return (mnt_vnode_next_lazy(mvp, mp, cb, cbarg)); 7202 } 7203 7204 struct vnode * 7205 __mnt_vnode_first_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb, 7206 void *cbarg) 7207 { 7208 struct vnode *vp; 7209 7210 if (TAILQ_EMPTY(&mp->mnt_lazyvnodelist)) 7211 return (NULL); 7212 7213 *mvp = vn_alloc_marker(mp); 7214 MNT_ILOCK(mp); 7215 MNT_REF(mp); 7216 MNT_IUNLOCK(mp); 7217 7218 mtx_lock(&mp->mnt_listmtx); 7219 vp = TAILQ_FIRST(&mp->mnt_lazyvnodelist); 7220 if (vp == NULL) { 7221 mtx_unlock(&mp->mnt_listmtx); 7222 mnt_vnode_markerfree_lazy(mvp, mp); 7223 return (NULL); 7224 } 7225 TAILQ_INSERT_BEFORE(vp, *mvp, v_lazylist); 7226 return (mnt_vnode_next_lazy(mvp, mp, cb, cbarg)); 7227 } 7228 7229 void 7230 __mnt_vnode_markerfree_lazy(struct vnode **mvp, struct mount *mp) 7231 { 7232 7233 if (*mvp == NULL) 7234 return; 7235 7236 mtx_lock(&mp->mnt_listmtx); 7237 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, v_lazylist); 7238 mtx_unlock(&mp->mnt_listmtx); 7239 mnt_vnode_markerfree_lazy(mvp, mp); 7240 } 7241 7242 int 7243 vn_dir_check_exec(struct vnode *vp, struct componentname *cnp) 7244 { 7245 7246 if ((cnp->cn_flags & NOEXECCHECK) != 0) { 7247 cnp->cn_flags &= ~NOEXECCHECK; 7248 return (0); 7249 } 7250 7251 return (VOP_ACCESS(vp, VEXEC, cnp->cn_cred, curthread)); 7252 } 7253 7254 /* 7255 * Do not use this variant unless you have means other than the hold count 7256 * to prevent the vnode from getting freed. 7257 */ 7258 void 7259 vn_seqc_write_begin_locked(struct vnode *vp) 7260 { 7261 7262 ASSERT_VI_LOCKED(vp, __func__); 7263 VNPASS(vp->v_holdcnt > 0, vp); 7264 VNPASS(vp->v_seqc_users >= 0, vp); 7265 vp->v_seqc_users++; 7266 if (vp->v_seqc_users == 1) 7267 seqc_sleepable_write_begin(&vp->v_seqc); 7268 } 7269 7270 void 7271 vn_seqc_write_begin(struct vnode *vp) 7272 { 7273 7274 VI_LOCK(vp); 7275 vn_seqc_write_begin_locked(vp); 7276 VI_UNLOCK(vp); 7277 } 7278 7279 void 7280 vn_seqc_write_end_locked(struct vnode *vp) 7281 { 7282 7283 ASSERT_VI_LOCKED(vp, __func__); 7284 VNPASS(vp->v_seqc_users > 0, vp); 7285 vp->v_seqc_users--; 7286 if (vp->v_seqc_users == 0) 7287 seqc_sleepable_write_end(&vp->v_seqc); 7288 } 7289 7290 void 7291 vn_seqc_write_end(struct vnode *vp) 7292 { 7293 7294 VI_LOCK(vp); 7295 vn_seqc_write_end_locked(vp); 7296 VI_UNLOCK(vp); 7297 } 7298 7299 /* 7300 * Special case handling for allocating and freeing vnodes. 7301 * 7302 * The counter remains unchanged on free so that a doomed vnode will 7303 * keep testing as in modify as long as it is accessible with SMR. 
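 *
 * A minimal reader-side sketch of what this supports (illustrative
 * only; vn_seqc_read_any()/vn_seqc_consistent() come from sys/vnode.h
 * and seqc_in_modify() from sys/seqc.h):
 *
 *	seqc = vn_seqc_read_any(vp);
 *	if (seqc_in_modify(seqc))
 *		return (EAGAIN);
 *	... speculatively read vnode fields under vfs_smr ...
 *	if (!vn_seqc_consistent(vp, seqc))
 *		return (EAGAIN);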
7304 */ 7305 static void 7306 vn_seqc_init(struct vnode *vp) 7307 { 7308 7309 vp->v_seqc = 0; 7310 vp->v_seqc_users = 0; 7311 } 7312 7313 static void 7314 vn_seqc_write_end_free(struct vnode *vp) 7315 { 7316 7317 VNPASS(seqc_in_modify(vp->v_seqc), vp); 7318 VNPASS(vp->v_seqc_users == 1, vp); 7319 } 7320 7321 void 7322 vn_irflag_set_locked(struct vnode *vp, short toset) 7323 { 7324 short flags; 7325 7326 ASSERT_VI_LOCKED(vp, __func__); 7327 flags = vn_irflag_read(vp); 7328 VNASSERT((flags & toset) == 0, vp, 7329 ("%s: some of the passed flags already set (have %d, passed %d)\n", 7330 __func__, flags, toset)); 7331 atomic_store_short(&vp->v_irflag, flags | toset); 7332 } 7333 7334 void 7335 vn_irflag_set(struct vnode *vp, short toset) 7336 { 7337 7338 VI_LOCK(vp); 7339 vn_irflag_set_locked(vp, toset); 7340 VI_UNLOCK(vp); 7341 } 7342 7343 void 7344 vn_irflag_set_cond_locked(struct vnode *vp, short toset) 7345 { 7346 short flags; 7347 7348 ASSERT_VI_LOCKED(vp, __func__); 7349 flags = vn_irflag_read(vp); 7350 atomic_store_short(&vp->v_irflag, flags | toset); 7351 } 7352 7353 void 7354 vn_irflag_set_cond(struct vnode *vp, short toset) 7355 { 7356 7357 VI_LOCK(vp); 7358 vn_irflag_set_cond_locked(vp, toset); 7359 VI_UNLOCK(vp); 7360 } 7361 7362 void 7363 vn_irflag_unset_locked(struct vnode *vp, short tounset) 7364 { 7365 short flags; 7366 7367 ASSERT_VI_LOCKED(vp, __func__); 7368 flags = vn_irflag_read(vp); 7369 VNASSERT((flags & tounset) == tounset, vp, 7370 ("%s: some of the passed flags not set (have %d, passed %d)\n", 7371 __func__, flags, tounset)); 7372 atomic_store_short(&vp->v_irflag, flags & ~tounset); 7373 } 7374 7375 void 7376 vn_irflag_unset(struct vnode *vp, short tounset) 7377 { 7378 7379 VI_LOCK(vp); 7380 vn_irflag_unset_locked(vp, tounset); 7381 VI_UNLOCK(vp); 7382 } 7383 7384 int 7385 vn_getsize_locked(struct vnode *vp, off_t *size, struct ucred *cred) 7386 { 7387 struct vattr vattr; 7388 int error; 7389 7390 ASSERT_VOP_LOCKED(vp, __func__); 7391 error = VOP_GETATTR(vp, &vattr, cred); 7392 if (__predict_true(error == 0)) { 7393 if (vattr.va_size <= OFF_MAX) 7394 *size = vattr.va_size; 7395 else 7396 error = EFBIG; 7397 } 7398 return (error); 7399 } 7400 7401 int 7402 vn_getsize(struct vnode *vp, off_t *size, struct ucred *cred) 7403 { 7404 int error; 7405 7406 VOP_LOCK(vp, LK_SHARED); 7407 error = vn_getsize_locked(vp, size, cred); 7408 VOP_UNLOCK(vp); 7409 return (error); 7410 } 7411 7412 #ifdef INVARIANTS 7413 void 7414 vn_set_state_validate(struct vnode *vp, __enum_uint8(vstate) state) 7415 { 7416 7417 switch (vp->v_state) { 7418 case VSTATE_UNINITIALIZED: 7419 switch (state) { 7420 case VSTATE_CONSTRUCTED: 7421 case VSTATE_DESTROYING: 7422 return; 7423 default: 7424 break; 7425 } 7426 break; 7427 case VSTATE_CONSTRUCTED: 7428 ASSERT_VOP_ELOCKED(vp, __func__); 7429 switch (state) { 7430 case VSTATE_DESTROYING: 7431 return; 7432 default: 7433 break; 7434 } 7435 break; 7436 case VSTATE_DESTROYING: 7437 ASSERT_VOP_ELOCKED(vp, __func__); 7438 switch (state) { 7439 case VSTATE_DEAD: 7440 return; 7441 default: 7442 break; 7443 } 7444 break; 7445 case VSTATE_DEAD: 7446 switch (state) { 7447 case VSTATE_UNINITIALIZED: 7448 return; 7449 default: 7450 break; 7451 } 7452 break; 7453 } 7454 7455 vn_printf(vp, "invalid state transition %d -> %d\n", vp->v_state, state); 7456 panic("invalid state transition %d -> %d\n", vp->v_state, state); 7457 } 7458 #endif 7459
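
/*
 * Example usage of the vnode traversal helpers above (an illustrative
 * sketch, not compiled here).  MNT_VNODE_FOREACH_ALL() and
 * MNT_VNODE_FOREACH_ALL_ABORT() are the sys/mount.h wrappers around
 * __mnt_vnode_first_all()/__mnt_vnode_next_all().  The iterator hands
 * back each vnode with its interlock held, so callers typically pass
 * LK_INTERLOCK to vget():
 *
 *	struct vnode *vp, *mvp;
 *	int error;
 *
 *	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
 *		if (vp->v_type != VREG) {
 *			VI_UNLOCK(vp);
 *			continue;
 *		}
 *		error = vget(vp, LK_EXCLUSIVE | LK_INTERLOCK);
 *		if (error != 0) {
 *			MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
 *			break;
 *		}
 *		... operate on the locked, referenced vnode ...
 *		vput(vp);
 *	}
 */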