/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * External virtual filesystem routines
 */

#include "opt_ddb.h"
#include "opt_watchdog.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/asan.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/capsicum.h>
#include <sys/condvar.h>
#include <sys/conf.h>
#include <sys/counter.h>
#include <sys/dirent.h>
#include <sys/event.h>
#include <sys/eventhandler.h>
#include <sys/extattr.h>
#include <sys/file.h>
#include <sys/fcntl.h>
#include <sys/inotify.h>
#include <sys/jail.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/ktr.h>
#include <sys/limits.h>
#include <sys/lockf.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/pctrie.h>
#include <sys/priv.h>
#include <sys/reboot.h>
#include <sys/refcount.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/sleepqueue.h>
#include <sys/smr.h>
#include <sys/smp.h>
#include <sys/stat.h>
#include <sys/stdarg.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/user.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>
#include <sys/watchdog.h>

#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_kern.h>
#include <vm/vnode_pager.h>
#include <vm/uma.h>

#ifdef DDB
#include <ddb/ddb.h>
#endif

static void	delmntque(struct vnode *vp);
static int	flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo,
		    int slpflag, int slptimeo);
static void	syncer_shutdown(void *arg, int howto);
static int	vtryrecycle(struct vnode *vp, bool isvnlru);
static void	v_init_counters(struct vnode *);
static void	vn_seqc_init(struct vnode *);
static void	vn_seqc_write_end_free(struct vnode *vp);
static void	vgonel(struct vnode *);
static bool	vhold_recycle_free(struct vnode *);
static void	vdropl_recycle(struct vnode *vp);
static void	vdrop_recycle(struct vnode *vp);
static void	vfs_knllock(void *arg);
static void	vfs_knlunlock(void *arg);
static void	vfs_knl_assert_lock(void *arg, int what);
static void	destroy_vpollinfo(struct vpollinfo *vi);
static int	v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo,
		    daddr_t startlbn, daddr_t endlbn);
static void	vnlru_recalc(void);

static SYSCTL_NODE(_vfs, OID_AUTO, vnode, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "vnode configuration and statistics");
static SYSCTL_NODE(_vfs_vnode, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "vnode configuration");
static SYSCTL_NODE(_vfs_vnode, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "vnode statistics");
static SYSCTL_NODE(_vfs_vnode, OID_AUTO, vnlru, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "vnode recycling");

/*
 * Number of vnodes in existence.  Increased whenever getnewvnode()
 * allocates a new vnode, decreased in vdropl() for VIRF_DOOMED vnode.
 */
static u_long __exclusive_cache_line numvnodes;

SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0,
    "Number of vnodes in existence (legacy)");
SYSCTL_ULONG(_vfs_vnode_stats, OID_AUTO, count, CTLFLAG_RD, &numvnodes, 0,
    "Number of vnodes in existence");

static counter_u64_t vnodes_created;
SYSCTL_COUNTER_U64(_vfs, OID_AUTO, vnodes_created, CTLFLAG_RD, &vnodes_created,
    "Number of vnodes created by getnewvnode (legacy)");
SYSCTL_COUNTER_U64(_vfs_vnode_stats, OID_AUTO, created, CTLFLAG_RD, &vnodes_created,
    "Number of vnodes created by getnewvnode");

/*
 * Conversion tables for conversion from vnode types to inode formats
 * and back.
 */
__enum_uint8(vtype) iftovt_tab[16] = {
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
};
int vttoif_tab[10] = {
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
	S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT
};

/*
 * List of allocated vnodes in the system.
 */
static TAILQ_HEAD(freelst, vnode) vnode_list;
static struct vnode *vnode_list_free_marker;
static struct vnode *vnode_list_reclaim_marker;

/*
 * "Free" vnode target.  Free vnodes are rarely completely free, but are
 * just ones that are cheap to recycle.  Usually they are for files which
 * have been stat'd but not read; these usually have inode and namecache
 * data attached to them.  This target is the preferred minimum size of a
 * sub-cache consisting mostly of such files.  The system balances the size
 * of this sub-cache with its complement to try to prevent either from
 * thrashing while the other is relatively inactive.  The targets express
 * a preference for the best balance.
 *
 * "Above" this target there are 2 further targets (watermarks) related
 * to recycling of free vnodes.  In the best-operating case, the cache is
 * exactly full, the free list has size between vlowat and vhiwat above the
 * free target, and recycling from it and normal use maintains this state.
 * Sometimes the free list is below vlowat or even empty, but this state
 * is even better for immediate use provided the cache is not full.
 * Otherwise, vnlru_proc() runs to reclaim enough vnodes (usually non-free
 * ones) to reach one of these states.  The watermarks are currently hard-
 * coded as 4% and 9% of the available space higher.  These and the default
 * of 25% for wantfreevnodes are too large if the memory size is large.
 * E.g., 9% of 75% of MAXVNODES is more than 566000 vnodes to reclaim
 * whenever vnlru_proc() becomes active.
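 *
 * Illustrative arithmetic (example numbers, not a tuning recommendation):
 * with desiredvnodes = 1000000 the default wantfreevnodes is 250000, so
 * gapvnodes = 750000, vhiwat = gapvnodes / 11 ~= 68181 (about 9%) and
 * vlowat = vhiwat / 2 ~= 34090; see vnlru_recalc() below.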
 */
static long wantfreevnodes;
static long __exclusive_cache_line freevnodes;
static long freevnodes_old;

static u_long recycles_count;
SYSCTL_ULONG(_vfs, OID_AUTO, recycles, CTLFLAG_RD | CTLFLAG_STATS, &recycles_count, 0,
    "Number of vnodes recycled to meet vnode cache targets (legacy)");
SYSCTL_ULONG(_vfs_vnode_vnlru, OID_AUTO, recycles, CTLFLAG_RD | CTLFLAG_STATS,
    &recycles_count, 0,
    "Number of vnodes recycled to meet vnode cache targets");

static u_long recycles_free_count;
SYSCTL_ULONG(_vfs, OID_AUTO, recycles_free, CTLFLAG_RD | CTLFLAG_STATS,
    &recycles_free_count, 0,
    "Number of free vnodes recycled to meet vnode cache targets (legacy)");
SYSCTL_ULONG(_vfs_vnode_vnlru, OID_AUTO, recycles_free, CTLFLAG_RD | CTLFLAG_STATS,
    &recycles_free_count, 0,
    "Number of free vnodes recycled to meet vnode cache targets");

static counter_u64_t direct_recycles_free_count;
SYSCTL_COUNTER_U64(_vfs_vnode_vnlru, OID_AUTO, direct_recycles_free, CTLFLAG_RD,
    &direct_recycles_free_count,
    "Number of free vnodes recycled by vn_alloc callers to meet vnode cache targets");

static counter_u64_t vnode_skipped_requeues;
SYSCTL_COUNTER_U64(_vfs_vnode_stats, OID_AUTO, skipped_requeues, CTLFLAG_RD, &vnode_skipped_requeues,
    "Number of times LRU requeue was skipped due to lock contention");

static __read_mostly bool vnode_can_skip_requeue;
SYSCTL_BOOL(_vfs_vnode_param, OID_AUTO, can_skip_requeue, CTLFLAG_RW,
    &vnode_can_skip_requeue, 0, "Is LRU requeue skippable");

static u_long deferred_inact;
SYSCTL_ULONG(_vfs, OID_AUTO, deferred_inact, CTLFLAG_RD,
    &deferred_inact, 0, "Number of times inactive processing was deferred");

/* To keep more than one thread at a time from running vfs_getnewfsid */
static struct mtx mntid_mtx;

/*
 * Lock for any access to the following:
 *	vnode_list
 *	numvnodes
 *	freevnodes
 */
static struct mtx __exclusive_cache_line vnode_list_mtx;

/* Publicly exported FS */
struct nfs_public nfs_pub;

static uma_zone_t buf_trie_zone;
static smr_t buf_trie_smr;

/* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
static uma_zone_t vnode_zone;
MALLOC_DEFINE(M_VNODEPOLL, "VN POLL", "vnode poll");

__read_frequently smr_t vfs_smr;

/*
 * The workitem queue.
 *
 * It is useful to delay writes of file data and filesystem metadata
 * for tens of seconds so that quickly created and deleted files need
 * not waste disk bandwidth being created and removed.  To realize this,
 * we append vnodes to a "workitem" queue.  When running with a soft
 * updates implementation, most pending metadata dependencies should
 * not wait for more than a few seconds.  Thus, filesystems mounted on
 * block devices are delayed only about half the time that file data is
 * delayed.  Similarly, directory updates are more critical, so are only
 * delayed about one-third the time that file data is delayed.  Thus,
 * there are SYNCER_MAXDELAY queues that are processed round-robin at a
 * rate of one each second (driven off the filesystem syncer process).
 * The syncer_delayno variable indicates the next queue that is to be
 * processed.
 * Items that need to be processed soon are placed in this queue:
 *
 *	syncer_workitem_pending[syncer_delayno]
 *
 * A delay of fifteen seconds is done by placing the request fifteen
 * entries later in the queue:
 *
 *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
 *
 */
static int syncer_delayno;
static long syncer_mask;
LIST_HEAD(synclist, bufobj);
static struct synclist *syncer_workitem_pending;
/*
 * The sync_mtx protects:
 *	bo->bo_synclist
 *	sync_vnode_count
 *	syncer_delayno
 *	syncer_state
 *	syncer_workitem_pending
 *	syncer_worklist_len
 *	rushjob
 */
static struct mtx sync_mtx;
static struct cv sync_wakeup;

#define SYNCER_MAXDELAY		32
static int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
static int syncdelay = 30;		/* max time to delay syncing data */
static int filedelay = 30;		/* time to delay syncing files */
SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0,
    "Time to delay syncing files (in seconds)");
static int dirdelay = 29;		/* time to delay syncing directories */
SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0,
    "Time to delay syncing directories (in seconds)");
static int metadelay = 28;		/* time to delay syncing metadata */
SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0,
    "Time to delay syncing metadata (in seconds)");
static int rushjob;		/* number of slots to run ASAP */
static int stat_rush_requests;	/* number of times I/O speeded up */
SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0,
    "Number of times I/O speeded up (rush requests)");

#define	VDBATCH_SIZE 8
struct vdbatch {
	u_int index;
	struct mtx lock;
	struct vnode *tab[VDBATCH_SIZE];
};
DPCPU_DEFINE_STATIC(struct vdbatch, vd);

static void	vdbatch_dequeue(struct vnode *vp);

/*
 * The syncer will require at least SYNCER_MAXDELAY iterations to shutdown;
 * we probably don't want to pause for the whole second each time.
 */
#define SYNCER_SHUTDOWN_SPEEDUP		32
static int sync_vnode_count;
static int syncer_worklist_len;
static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY }
    syncer_state;

/* Target for maximum number of vnodes. */
u_long desiredvnodes;
static u_long gapvnodes;		/* gap between wanted and desired */
static u_long vhiwat;			/* enough extras after expansion */
static u_long vlowat;			/* minimal extras before expansion */
static bool vstir;			/* nonzero to stir non-free vnodes */
static volatile int vsmalltrigger = 8;	/* pref to keep if > this many pages */

static u_long vnlru_read_freevnodes(void);

/*
 * Note that no attempt is made to sanitize these parameters.
 */
static int
sysctl_maxvnodes(SYSCTL_HANDLER_ARGS)
{
	u_long val;
	int error;

	val = desiredvnodes;
	error = sysctl_handle_long(oidp, &val, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);

	if (val == desiredvnodes)
		return (0);
	mtx_lock(&vnode_list_mtx);
	desiredvnodes = val;
	wantfreevnodes = desiredvnodes / 4;
	vnlru_recalc();
	mtx_unlock(&vnode_list_mtx);
	/*
	 * XXX There is no protection against multiple threads changing
	 * desiredvnodes at the same time.  Locking above only helps vnlru and
	 * getnewvnode.
	 */
	vfs_hash_changesize(desiredvnodes);
	cache_changesize(desiredvnodes);
	return (0);
}

SYSCTL_PROC(_kern, KERN_MAXVNODES, maxvnodes,
    CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_maxvnodes,
    "LU", "Target for maximum number of vnodes (legacy)");
SYSCTL_PROC(_vfs_vnode_param, OID_AUTO, limit,
    CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_maxvnodes,
    "LU", "Target for maximum number of vnodes");

static int
sysctl_freevnodes(SYSCTL_HANDLER_ARGS)
{
	u_long rfreevnodes;

	rfreevnodes = vnlru_read_freevnodes();
	return (sysctl_handle_long(oidp, &rfreevnodes, 0, req));
}

SYSCTL_PROC(_vfs, OID_AUTO, freevnodes,
    CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RD, NULL, 0, sysctl_freevnodes,
    "LU", "Number of \"free\" vnodes (legacy)");
SYSCTL_PROC(_vfs_vnode_stats, OID_AUTO, free,
    CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RD, NULL, 0, sysctl_freevnodes,
    "LU", "Number of \"free\" vnodes");

static int
sysctl_wantfreevnodes(SYSCTL_HANDLER_ARGS)
{
	u_long val;
	int error;

	val = wantfreevnodes;
	error = sysctl_handle_long(oidp, &val, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);

	if (val == wantfreevnodes)
		return (0);
	mtx_lock(&vnode_list_mtx);
	wantfreevnodes = val;
	vnlru_recalc();
	mtx_unlock(&vnode_list_mtx);
	return (0);
}

SYSCTL_PROC(_vfs, OID_AUTO, wantfreevnodes,
    CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_wantfreevnodes,
    "LU", "Target for minimum number of \"free\" vnodes (legacy)");
SYSCTL_PROC(_vfs_vnode_param, OID_AUTO, wantfree,
    CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_wantfreevnodes,
    "LU", "Target for minimum number of \"free\" vnodes");

static int vnlru_nowhere;
SYSCTL_INT(_vfs_vnode_vnlru, OID_AUTO, failed_runs, CTLFLAG_RD | CTLFLAG_STATS,
    &vnlru_nowhere, 0, "Number of times the vnlru process ran without success");

static int
sysctl_try_reclaim_vnode(SYSCTL_HANDLER_ARGS)
{
	struct vnode *vp;
	struct nameidata nd;
	char *buf;
	unsigned long ndflags;
	int error;

	if (req->newptr == NULL)
		return (EINVAL);
	if (req->newlen >= PATH_MAX)
		return (E2BIG);

	buf = malloc(PATH_MAX, M_TEMP, M_WAITOK);
	error = SYSCTL_IN(req, buf, req->newlen);
	if (error != 0)
		goto out;

	buf[req->newlen] = '\0';

	ndflags = LOCKLEAF | NOFOLLOW | AUDITVNODE1;
	NDINIT(&nd, LOOKUP, ndflags, UIO_SYSSPACE, buf);
	if ((error = namei(&nd)) != 0)
		goto out;
	vp = nd.ni_vp;

	if (VN_IS_DOOMED(vp)) {
		/*
		 * This vnode is being recycled.  Return != 0 to let the caller
		 * know that the sysctl had no effect.
		 * Return EAGAIN because a subsequent call will likely succeed
		 * (since namei will create a new vnode if necessary).
		 */
		error = EAGAIN;
		goto putvnode;
	}

	vgone(vp);
putvnode:
	vput(vp);
	NDFREE_PNBUF(&nd);
out:
	free(buf, M_TEMP);
	return (error);
}

static int
sysctl_ftry_reclaim_vnode(SYSCTL_HANDLER_ARGS)
{
	struct thread *td = curthread;
	struct vnode *vp;
	struct file *fp;
	int error;
	int fd;

	if (req->newptr == NULL)
		return (EBADF);

	error = sysctl_handle_int(oidp, &fd, 0, req);
	if (error != 0)
		return (error);
	error = getvnode(curthread, fd, &cap_fcntl_rights, &fp);
	if (error != 0)
		return (error);
	vp = fp->f_vnode;

	error = vn_lock(vp, LK_EXCLUSIVE);
	if (error != 0)
		goto drop;

	vgone(vp);
	VOP_UNLOCK(vp);
drop:
	fdrop(fp, td);
	return (error);
}

SYSCTL_PROC(_debug, OID_AUTO, try_reclaim_vnode,
    CTLTYPE_STRING | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0,
    sysctl_try_reclaim_vnode, "A", "Try to reclaim a vnode by its pathname");
SYSCTL_PROC(_debug, OID_AUTO, ftry_reclaim_vnode,
    CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0,
    sysctl_ftry_reclaim_vnode, "I",
    "Try to reclaim a vnode by its file descriptor");

/* Shift count for (uintptr_t)vp to initialize vp->v_hash. */
#define vnsz2log 8
#ifndef DEBUG_LOCKS
_Static_assert(sizeof(struct vnode) >= 1UL << vnsz2log &&
    sizeof(struct vnode) < 1UL << (vnsz2log + 1),
    "vnsz2log needs to be updated");
#endif

/*
 * Support for the bufobj clean & dirty pctrie.
 */
static void *
buf_trie_alloc(struct pctrie *ptree)
{
	return (uma_zalloc_smr(buf_trie_zone, M_NOWAIT));
}

static void
buf_trie_free(struct pctrie *ptree, void *node)
{
	uma_zfree_smr(buf_trie_zone, node);
}
PCTRIE_DEFINE_SMR(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free,
    buf_trie_smr);

/*
 * Lookup the next element greater than or equal to lblkno, accounting for the
 * fact that, for pctries, negative values are greater than nonnegative ones.
 */
static struct buf *
buf_lookup_ge(struct bufv *bv, daddr_t lblkno)
{
	struct buf *bp;

	bp = BUF_PCTRIE_LOOKUP_GE(&bv->bv_root, lblkno);
	if (bp == NULL && lblkno < 0)
		bp = BUF_PCTRIE_LOOKUP_GE(&bv->bv_root, 0);
	if (bp != NULL && bp->b_lblkno < lblkno)
		bp = NULL;
	return (bp);
}

/*
 * Insert bp, and find the next element smaller than bp, accounting for the fact
 * that, for pctries, negative values are greater than nonnegative ones.
 */
static int
buf_insert_lookup_le(struct bufv *bv, struct buf *bp, struct buf **n)
{
	int error;

	error = BUF_PCTRIE_INSERT_LOOKUP_LE(&bv->bv_root, bp, n);
	if (error != EEXIST) {
		if (*n == NULL && bp->b_lblkno >= 0)
			*n = BUF_PCTRIE_LOOKUP_LE(&bv->bv_root, ~0L);
		if (*n != NULL && (*n)->b_lblkno >= bp->b_lblkno)
			*n = NULL;
	}
	return (error);
}

/*
 * Initialize the vnode management data structures.
 *
 * Reevaluate the following cap on the number of vnodes after the physical
 * memory size exceeds 512GB.  In the limit, as the physical memory size
 * grows, the ratio of the memory size in KB to vnodes approaches 64:1.
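 *
 * Illustrative arithmetic: 512GB of physical memory is 512 * 1024 * 1024 KB,
 * and dividing by the 64 KB-per-vnode ratio above gives the 8M figure used
 * for MAXVNODES_MAX below (8388608 vnodes).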
 */
#ifndef	MAXVNODES_MAX
#define	MAXVNODES_MAX	(512UL * 1024 * 1024 / 64)	/* 8M */
#endif

static MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker");

static struct vnode *
vn_alloc_marker(struct mount *mp)
{
	struct vnode *vp;

	vp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO);
	vp->v_type = VMARKER;
	vp->v_mount = mp;

	return (vp);
}

static void
vn_free_marker(struct vnode *vp)
{

	MPASS(vp->v_type == VMARKER);
	free(vp, M_VNODE_MARKER);
}

#ifdef KASAN
static int
vnode_ctor(void *mem, int size, void *arg __unused, int flags __unused)
{
	kasan_mark(mem, size, roundup2(size, UMA_ALIGN_PTR + 1), 0);
	return (0);
}

static void
vnode_dtor(void *mem, int size, void *arg __unused)
{
	size_t end1, end2, off1, off2;

	_Static_assert(offsetof(struct vnode, v_vnodelist) <
	    offsetof(struct vnode, v_dbatchcpu),
	    "KASAN marks require updating");

	off1 = offsetof(struct vnode, v_vnodelist);
	off2 = offsetof(struct vnode, v_dbatchcpu);
	end1 = off1 + sizeof(((struct vnode *)NULL)->v_vnodelist);
	end2 = off2 + sizeof(((struct vnode *)NULL)->v_dbatchcpu);

	/*
	 * Access to the v_vnodelist and v_dbatchcpu fields are permitted even
	 * after the vnode has been freed.  Try to get some KASAN coverage by
	 * marking everything except those two fields as invalid.  Because
	 * KASAN's tracking is not byte-granular, any preceding fields sharing
	 * the same 8-byte aligned word must also be marked valid.
	 */

	/* Handle the area from the start until v_vnodelist... */
	off1 = rounddown2(off1, KASAN_SHADOW_SCALE);
	kasan_mark(mem, off1, off1, KASAN_UMA_FREED);

	/* ... then the area between v_vnodelist and v_dbatchcpu ... */
	off1 = roundup2(end1, KASAN_SHADOW_SCALE);
	off2 = rounddown2(off2, KASAN_SHADOW_SCALE);
	if (off2 > off1)
		kasan_mark((void *)((char *)mem + off1), off2 - off1,
		    off2 - off1, KASAN_UMA_FREED);

	/* ... and finally the area from v_dbatchcpu to the end. */
	off2 = roundup2(end2, KASAN_SHADOW_SCALE);
	kasan_mark((void *)((char *)mem + off2), size - off2, size - off2,
	    KASAN_UMA_FREED);
}
#endif /* KASAN */

/*
 * Initialize a vnode as it first enters the zone.
 */
static int
vnode_init(void *mem, int size, int flags)
{
	struct vnode *vp;

	vp = mem;
	bzero(vp, size);
	/*
	 * Setup locks.
	 */
	vp->v_vnlock = &vp->v_lock;
	mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
	/*
	 * By default, don't allow shared locks unless filesystems opt-in.
	 */
	lockinit(vp->v_vnlock, PVFS, "vnode", VLKTIMEOUT,
	    LK_NOSHARE | LK_IS_VNODE);
	/*
	 * Initialize bufobj.
	 */
	bufobj_init(&vp->v_bufobj, vp);
	/*
	 * Initialize namecache.
	 */
	cache_vnode_init(vp);
	/*
	 * Initialize rangelocks.
	 */
	rangelock_init(&vp->v_rl);

	vp->v_dbatchcpu = NOCPU;

	vp->v_state = VSTATE_DEAD;

	/*
	 * Check vhold_recycle_free for an explanation.
	 */
	vp->v_holdcnt = VHOLD_NO_SMR;
	vp->v_type = VNON;
	mtx_lock(&vnode_list_mtx);
	TAILQ_INSERT_BEFORE(vnode_list_free_marker, vp, v_vnodelist);
	mtx_unlock(&vnode_list_mtx);
	return (0);
}

/*
 * Free a vnode when it is cleared from the zone.
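 * This undoes vnode_init(): the vnode is pulled out of any pending per-CPU
 * vdbatch, removed from the global vnode_list, and its vnode lock, interlock,
 * rangelock and bufobj lock are destroyed before the memory goes back to UMA.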
 */
static void
vnode_fini(void *mem, int size)
{
	struct vnode *vp;
	struct bufobj *bo;

	vp = mem;
	vdbatch_dequeue(vp);
	mtx_lock(&vnode_list_mtx);
	TAILQ_REMOVE(&vnode_list, vp, v_vnodelist);
	mtx_unlock(&vnode_list_mtx);
	rangelock_destroy(&vp->v_rl);
	lockdestroy(vp->v_vnlock);
	mtx_destroy(&vp->v_interlock);
	bo = &vp->v_bufobj;
	rw_destroy(BO_LOCKPTR(bo));

	kasan_mark(mem, size, size, 0);
}

/*
 * Provide the size of NFS nclnode and NFS fh for calculation of the
 * vnode memory consumption.  The size is specified directly to
 * eliminate dependency on NFS-private header.
 *
 * Other filesystems may use bigger or smaller (like UFS and ZFS)
 * private inode data, but the NFS-based estimation is ample enough.
 * Still, we care about differences in the size between 64- and 32-bit
 * platforms.
 *
 * Namecache structure size is heuristically
 * sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1.
 */
#ifdef _LP64
#define	NFS_NCLNODE_SZ	(528 + 64)
#define	NC_SZ		148
#else
#define	NFS_NCLNODE_SZ	(360 + 32)
#define	NC_SZ		92
#endif

static void
vntblinit(void *dummy __unused)
{
	struct vdbatch *vd;
	uma_ctor ctor;
	uma_dtor dtor;
	int cpu, physvnodes, virtvnodes;

	/*
	 * 'desiredvnodes' is the minimum of a function of the physical memory
	 * size and another of the kernel heap size (UMA limit, a portion of
	 * the KVA).
	 *
	 * Currently, on 64-bit platforms, 'desiredvnodes' is set to
	 * 'virtvnodes' up to a physical memory cutoff of ~1722MB, after which
	 * 'physvnodes' applies instead.  With the current automatic tuning for
	 * 'maxfiles' (32 files/MB), 'desiredvnodes' is always greater than it.
	 */
	physvnodes = maxproc + pgtok(vm_cnt.v_page_count) / 32 +
	    min(98304 * 16, pgtok(vm_cnt.v_page_count)) / 32;
	virtvnodes = vm_kmem_size / (10 * (sizeof(struct vm_object) +
	    sizeof(struct vnode) + NC_SZ * ncsizefactor + NFS_NCLNODE_SZ));
	desiredvnodes = min(physvnodes, virtvnodes);
	if (desiredvnodes > MAXVNODES_MAX) {
		if (bootverbose)
			printf("Reducing kern.maxvnodes %lu -> %lu\n",
			    desiredvnodes, MAXVNODES_MAX);
		desiredvnodes = MAXVNODES_MAX;
	}
	wantfreevnodes = desiredvnodes / 4;
	mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
	TAILQ_INIT(&vnode_list);
	mtx_init(&vnode_list_mtx, "vnode_list", NULL, MTX_DEF);
	/*
	 * The lock is taken to appease WITNESS.
	 */
	mtx_lock(&vnode_list_mtx);
	vnlru_recalc();
	mtx_unlock(&vnode_list_mtx);
	vnode_list_free_marker = vn_alloc_marker(NULL);
	TAILQ_INSERT_HEAD(&vnode_list, vnode_list_free_marker, v_vnodelist);
	vnode_list_reclaim_marker = vn_alloc_marker(NULL);
	TAILQ_INSERT_HEAD(&vnode_list, vnode_list_reclaim_marker, v_vnodelist);

#ifdef KASAN
	ctor = vnode_ctor;
	dtor = vnode_dtor;
#else
	ctor = NULL;
	dtor = NULL;
#endif
	vnode_zone = uma_zcreate("VNODE", sizeof(struct vnode), ctor, dtor,
	    vnode_init, vnode_fini, UMA_ALIGN_PTR, UMA_ZONE_NOKASAN);
	uma_zone_set_smr(vnode_zone, vfs_smr);

	/*
	 * Preallocate enough nodes to support one-per buf so that
	 * we can not fail an insert.  reassignbuf() callers can not
	 * tolerate the insertion failure.
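	 * As an implementation note: the zone below is created with
	 * UMA_ZONE_NOFREE and uma_prealloc() reserves one node per buffer
	 * up front, so the reserve is never returned to the system.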
	 */
	buf_trie_zone = uma_zcreate("BUF TRIE", pctrie_node_size(),
	    NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR,
	    UMA_ZONE_NOFREE | UMA_ZONE_SMR);
	buf_trie_smr = uma_zone_get_smr(buf_trie_zone);
	uma_prealloc(buf_trie_zone, nbuf);

	vnodes_created = counter_u64_alloc(M_WAITOK);
	direct_recycles_free_count = counter_u64_alloc(M_WAITOK);
	vnode_skipped_requeues = counter_u64_alloc(M_WAITOK);

	/*
	 * Initialize the filesystem syncer.
	 */
	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
	    &syncer_mask);
	syncer_maxdelay = syncer_mask + 1;
	mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
	cv_init(&sync_wakeup, "syncer");

	CPU_FOREACH(cpu) {
		vd = DPCPU_ID_PTR((cpu), vd);
		bzero(vd, sizeof(*vd));
		mtx_init(&vd->lock, "vdbatch", NULL, MTX_DEF);
	}
}
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL);

/*
 * Mark a mount point as busy.  Used to synchronize access and to delay
 * unmounting.  Note that mountlist_mtx is not released on failure.
 *
 * vfs_busy() is a custom lock, it can block the caller.
 * vfs_busy() only sleeps if the unmount is active on the mount point.
 * For a mountpoint mp, vfs_busy-enforced lock is before lock of any
 * vnode belonging to mp.
 *
 * Lookup uses vfs_busy() to traverse mount points.
 * root fs			var fs
 * / vnode lock		A	/ vnode lock (/var)		D
 * /var vnode lock	B	/log vnode lock(/var/log)	E
 * vfs_busy lock	C	vfs_busy lock			F
 *
 * Within each file system, the lock order is C->A->B and F->D->E.
 *
 * When traversing across mounts, the system follows that lock order:
 *
 *	C->A->B
 *	        |
 *	        +->F->D->E
 *
 * The lookup() process for namei("/var") illustrates the process:
 *  1. VOP_LOOKUP() obtains B while A is held
 *  2. vfs_busy() obtains a shared lock on F while A and B are held
 *  3. vput() releases lock on B
 *  4. vput() releases lock on A
 *  5. VFS_ROOT() obtains lock on D while shared lock on F is held
 *  6. vfs_unbusy() releases shared lock on F
 *  7. vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A.
 *     Attempt to lock A (instead of vp_crossmp) while D is held would
 *     violate the global order, causing deadlocks.
 *
 * dounmount() locks B while F is drained.  Note that for stacked
 * filesystems, D and B in the example above may be the same lock,
 * which introduces potential lock order reversal deadlock between
 * dounmount() and step 5 above.  These filesystems may avoid the LOR
 * by setting VV_CROSSLOCK on the covered vnode so that lock B will
 * remain held until after step 5.
 */
int
vfs_busy(struct mount *mp, int flags)
{
	struct mount_pcpu *mpcpu;

	MPASS((flags & ~MBF_MASK) == 0);
	CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags);

	if (vfs_op_thread_enter(mp, mpcpu)) {
		MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0);
		MPASS((mp->mnt_kern_flag & MNTK_UNMOUNT) == 0);
		MPASS((mp->mnt_kern_flag & MNTK_REFEXPIRE) == 0);
		vfs_mp_count_add_pcpu(mpcpu, ref, 1);
		vfs_mp_count_add_pcpu(mpcpu, lockref, 1);
		vfs_op_thread_exit(mp, mpcpu);
		if (flags & MBF_MNTLSTLOCK)
			mtx_unlock(&mountlist_mtx);
		return (0);
	}

	MNT_ILOCK(mp);
	vfs_assert_mount_counters(mp);
	MNT_REF(mp);
	/*
	 * If mount point is currently being unmounted, sleep until the
	 * mount point fate is decided.
	 * If the thread doing the unmounting fails, it will clear
	 * MNTK_UNMOUNT flag before waking us up, indicating that this mount
	 * point has survived the unmount attempt and vfs_busy should retry.
	 * Otherwise the unmounter thread will set MNTK_REFEXPIRE flag in
	 * addition to MNTK_UNMOUNT, indicating that mount point is about to
	 * be really destroyed.  vfs_busy needs to release its reference on
	 * the mount point in this case and return with ENOENT, telling the
	 * caller the mount it tried to busy is no longer valid.
	 */
	while (mp->mnt_kern_flag & MNTK_UNMOUNT) {
		KASSERT(TAILQ_EMPTY(&mp->mnt_uppers),
		    ("%s: non-empty upper mount list with pending unmount",
		    __func__));
		if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) {
			MNT_REL(mp);
			MNT_IUNLOCK(mp);
			CTR1(KTR_VFS, "%s: failed busying before sleeping",
			    __func__);
			return (ENOENT);
		}
		if (flags & MBF_MNTLSTLOCK)
			mtx_unlock(&mountlist_mtx);
		mp->mnt_kern_flag |= MNTK_MWAIT;
		msleep(mp, MNT_MTX(mp), PVFS | PDROP, "vfs_busy", 0);
		if (flags & MBF_MNTLSTLOCK)
			mtx_lock(&mountlist_mtx);
		MNT_ILOCK(mp);
	}
	if (flags & MBF_MNTLSTLOCK)
		mtx_unlock(&mountlist_mtx);
	mp->mnt_lockref++;
	MNT_IUNLOCK(mp);
	return (0);
}

/*
 * Free a busy filesystem.
 */
void
vfs_unbusy(struct mount *mp)
{
	struct mount_pcpu *mpcpu;
	int c;

	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);

	if (vfs_op_thread_enter(mp, mpcpu)) {
		MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0);
		vfs_mp_count_sub_pcpu(mpcpu, lockref, 1);
		vfs_mp_count_sub_pcpu(mpcpu, ref, 1);
		vfs_op_thread_exit(mp, mpcpu);
		return;
	}

	MNT_ILOCK(mp);
	vfs_assert_mount_counters(mp);
	MNT_REL(mp);
	c = --mp->mnt_lockref;
	if (mp->mnt_vfs_ops == 0) {
		MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0);
		MNT_IUNLOCK(mp);
		return;
	}
	if (c < 0)
		vfs_dump_mount_counters(mp);
	if (c == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) {
		MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT);
		CTR1(KTR_VFS, "%s: waking up waiters", __func__);
		mp->mnt_kern_flag &= ~MNTK_DRAINING;
		wakeup(&mp->mnt_lockref);
	}
	MNT_IUNLOCK(mp);
}

/*
 * Lookup a mount point by filesystem identifier.
 */
struct mount *
vfs_getvfs(fsid_t *fsid)
{
	struct mount *mp;

	CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
	mtx_lock(&mountlist_mtx);
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) {
			vfs_ref(mp);
			mtx_unlock(&mountlist_mtx);
			return (mp);
		}
	}
	mtx_unlock(&mountlist_mtx);
	CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
	return ((struct mount *) 0);
}

/*
 * Lookup a mount point by filesystem identifier, busying it before
 * returning.
 *
 * To avoid congestion on mountlist_mtx, implement simple direct-mapped
 * cache for popular filesystem identifiers.  The cache is lockless, using
 * the fact that struct mount's are never freed.  In the worst case we may
 * get a pointer to an unmounted or even different filesystem, so we have
 * to check what we got, and fall back to the slow path if so.
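 *
 * Illustrative example of the cache indexing below: the two fsid words are
 * folded as hash = val[0] ^ val[1], then hash = (hash >> 16 ^ hash) &
 * (FSID_CACHE_SIZE - 1), i.e. a value in [0, 255] selecting one cache slot.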
 */
struct mount *
vfs_busyfs(fsid_t *fsid)
{
#define	FSID_CACHE_SIZE	256
	typedef struct mount * volatile vmp_t;
	static vmp_t cache[FSID_CACHE_SIZE];
	struct mount *mp;
	int error;
	uint32_t hash;

	CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
	hash = fsid->val[0] ^ fsid->val[1];
	hash = (hash >> 16 ^ hash) & (FSID_CACHE_SIZE - 1);
	mp = cache[hash];
	if (mp == NULL || fsidcmp(&mp->mnt_stat.f_fsid, fsid) != 0)
		goto slow;
	if (vfs_busy(mp, 0) != 0) {
		cache[hash] = NULL;
		goto slow;
	}
	if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0)
		return (mp);
	else
		vfs_unbusy(mp);

slow:
	mtx_lock(&mountlist_mtx);
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) {
			error = vfs_busy(mp, MBF_MNTLSTLOCK);
			if (error) {
				cache[hash] = NULL;
				mtx_unlock(&mountlist_mtx);
				return (NULL);
			}
			cache[hash] = mp;
			return (mp);
		}
	}
	CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
	mtx_unlock(&mountlist_mtx);
	return ((struct mount *) 0);
}

/*
 * Check if a user can access privileged mount options.
 */
int
vfs_suser(struct mount *mp, struct thread *td)
{
	int error;

	if (jailed(td->td_ucred)) {
		/*
		 * If the jail of the calling thread lacks permission for
		 * this type of file system, deny immediately.
		 */
		if (!prison_allow(td->td_ucred, mp->mnt_vfc->vfc_prison_flag))
			return (EPERM);

		/*
		 * If the file system was mounted outside the jail of the
		 * calling thread, deny immediately.
		 */
		if (prison_check(td->td_ucred, mp->mnt_cred) != 0)
			return (EPERM);
	}

	/*
	 * If file system supports delegated administration, we don't check
	 * for the PRIV_VFS_MOUNT_OWNER privilege - it will be better verified
	 * by the file system itself.
	 * If this is not the user that did original mount, we check for
	 * the PRIV_VFS_MOUNT_OWNER privilege.
	 */
	if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) &&
	    mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) {
		if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0)
			return (error);
	}
	return (0);
}

/*
 * Get a new unique fsid.  Try to make its val[0] unique, since this value
 * will be used to create fake device numbers for stat().  Also try (but
 * not so hard) to make its val[0] unique mod 2^16, since some emulators
 * only support 16-bit device numbers.  We end up with unique val[0]'s
 * for the first 2^16 calls and unique val[0]'s mod 2^16 for the first
 * 2^8 calls.
 *
 * Keep in mind that several mounts may be running in parallel.  Starting
 * the search one past where the previous search terminated is both a
 * micro-optimization and a defense against returning the same fsid to
 * different mounts.
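 *
 * Illustrative layout of the generated identifier (example numbers): with
 * vfc_typenum = 5 and mntid_base = 0x1234, the minor passed to
 * makedev(255, ...) below is (5 << 24) | (0x1200 << 8) | 0x34 = 0x05120034,
 * while val[1] simply holds the filesystem type number.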
 */
void
vfs_getnewfsid(struct mount *mp)
{
	static uint16_t mntid_base;
	struct mount *nmp;
	fsid_t tfsid;
	int mtype;

	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
	mtx_lock(&mntid_mtx);
	mtype = mp->mnt_vfc->vfc_typenum;
	tfsid.val[1] = mtype;
	mtype = (mtype & 0xFF) << 24;
	for (;;) {
		tfsid.val[0] = makedev(255,
		    mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
		mntid_base++;
		if ((nmp = vfs_getvfs(&tfsid)) == NULL)
			break;
		vfs_rel(nmp);
	}
	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
	mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
	mtx_unlock(&mntid_mtx);
}

/*
 * Knob to control the precision of file timestamps:
 *
 * 0 = seconds only; nanoseconds zeroed.
 * 1 = seconds and nanoseconds, accurate within 1/HZ.
 * 2 = seconds and nanoseconds, truncated to microseconds.
 * >=3 = seconds and nanoseconds, maximum precision.
 */
enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };

static int timestamp_precision = TSP_USEC;
SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
    &timestamp_precision, 0, "File timestamp precision (0: seconds, "
    "1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to us, "
    "3+: sec + ns (max. precision))");

/*
 * Get a current timestamp.
 */
void
vfs_timestamp(struct timespec *tsp)
{
	struct timeval tv;

	switch (timestamp_precision) {
	case TSP_SEC:
		tsp->tv_sec = time_second;
		tsp->tv_nsec = 0;
		break;
	case TSP_HZ:
		getnanotime(tsp);
		break;
	case TSP_USEC:
		microtime(&tv);
		TIMEVAL_TO_TIMESPEC(&tv, tsp);
		break;
	case TSP_NSEC:
	default:
		nanotime(tsp);
		break;
	}
}

/*
 * Set vnode attributes to VNOVAL
 */
void
vattr_null(struct vattr *vap)
{

	vap->va_type = VNON;
	vap->va_size = VNOVAL;
	vap->va_bytes = VNOVAL;
	vap->va_mode = VNOVAL;
	vap->va_nlink = VNOVAL;
	vap->va_uid = VNOVAL;
	vap->va_gid = VNOVAL;
	vap->va_fsid = VNOVAL;
	vap->va_fileid = VNOVAL;
	vap->va_blocksize = VNOVAL;
	vap->va_rdev = VNOVAL;
	vap->va_atime.tv_sec = VNOVAL;
	vap->va_atime.tv_nsec = VNOVAL;
	vap->va_mtime.tv_sec = VNOVAL;
	vap->va_mtime.tv_nsec = VNOVAL;
	vap->va_ctime.tv_sec = VNOVAL;
	vap->va_ctime.tv_nsec = VNOVAL;
	vap->va_birthtime.tv_sec = VNOVAL;
	vap->va_birthtime.tv_nsec = VNOVAL;
	vap->va_flags = VNOVAL;
	vap->va_gen = VNOVAL;
	vap->va_vaflags = 0;
	vap->va_filerev = VNOVAL;
	vap->va_bsdflags = 0;
}

/*
 * Try to reduce the total number of vnodes.
 *
 * This routine (and its user) are buggy in at least the following ways:
 * - all parameters were picked years ago when RAM sizes were significantly
 *   smaller
 * - it can pick vnodes based on pages used by the vm object, but filesystems
 *   like ZFS don't use it making the pick broken
 * - since ZFS has its own aging policy it gets partially combated by this one
 * - a dedicated method should be provided for filesystems to let them decide
 *   whether the vnode should be recycled
 *
 * This routine is called when we have too many vnodes.  It attempts
 * to free <count> vnodes and will potentially free vnodes that still
 * have VM backing store (VM backing store is typically the cause
 * of a vnode blowout so we want to do this).
 * Therefore, this operation is not considered cheap.
 *
 * A number of conditions may prevent a vnode from being reclaimed:
 * the buffer cache may have references on the vnode, a directory
 * vnode may still have references due to the namei cache representing
 * underlying files, or the vnode may be in active use.  It is not
 * desirable to reuse such vnodes.  These conditions may cause the
 * number of vnodes to reach some minimum value regardless of what
 * you set kern.maxvnodes to.  Do not set kern.maxvnodes too low.
 *
 * @param reclaim_nc_src Only reclaim directories with outgoing namecache
 *			 entries if this argument is true
 * @param trigger	 Only reclaim vnodes with fewer than this many resident
 *			 pages.
 * @param target	 How many vnodes to reclaim.
 * @return		 The number of vnodes that were reclaimed.
 */
static int
vlrureclaim(bool reclaim_nc_src, int trigger, u_long target)
{
	struct vnode *vp, *mvp;
	struct mount *mp;
	struct vm_object *object;
	u_long done;
	bool retried;

	mtx_assert(&vnode_list_mtx, MA_OWNED);

	retried = false;
	done = 0;

	mvp = vnode_list_reclaim_marker;
restart:
	vp = mvp;
	while (done < target) {
		vp = TAILQ_NEXT(vp, v_vnodelist);
		if (__predict_false(vp == NULL))
			break;

		if (__predict_false(vp->v_type == VMARKER))
			continue;

		/*
		 * If it's been deconstructed already, it's still
		 * referenced, or it exceeds the trigger, skip it.
		 * Also skip free vnodes.  We are trying to make space
		 * for more free vnodes, not reduce their count.
		 */
		if (vp->v_usecount > 0 || vp->v_holdcnt == 0 ||
		    (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)))
			goto next_iter;

		if (vp->v_type == VBAD || vp->v_type == VNON)
			goto next_iter;

		object = atomic_load_ptr(&vp->v_object);
		if (object == NULL || object->resident_page_count > trigger) {
			goto next_iter;
		}

		/*
		 * Handle races against vnode allocation.  Filesystems lock the
		 * vnode some time after it gets returned from getnewvnode,
		 * despite type and hold count being manipulated earlier.
		 * Resorting to checking v_mount restores guarantees present
		 * before the global list was reworked to contain all vnodes.
		 */
		if (!VI_TRYLOCK(vp))
			goto next_iter;
		if (__predict_false(vp->v_type == VBAD || vp->v_type == VNON)) {
			VI_UNLOCK(vp);
			goto next_iter;
		}
		if (vp->v_mount == NULL) {
			VI_UNLOCK(vp);
			goto next_iter;
		}
		vholdl(vp);
		VI_UNLOCK(vp);
		TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
		TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist);
		mtx_unlock(&vnode_list_mtx);

		if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
			vdrop_recycle(vp);
			goto next_iter_unlocked;
		}
		if (VOP_LOCK(vp, LK_EXCLUSIVE|LK_NOWAIT) != 0) {
			vdrop_recycle(vp);
			vn_finished_write(mp);
			goto next_iter_unlocked;
		}

		VI_LOCK(vp);
		if (vp->v_usecount > 0 ||
		    (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) ||
		    (vp->v_object != NULL && vp->v_object->handle == vp &&
		    vp->v_object->resident_page_count > trigger)) {
			VOP_UNLOCK(vp);
			vdropl_recycle(vp);
			vn_finished_write(mp);
			goto next_iter_unlocked;
		}
		recycles_count++;
		vgonel(vp);
		VOP_UNLOCK(vp);
		vdropl_recycle(vp);
		vn_finished_write(mp);
		done++;
next_iter_unlocked:
		maybe_yield();
		mtx_lock(&vnode_list_mtx);
		goto restart;
next_iter:
		MPASS(vp->v_type != VMARKER);
		if (!should_yield())
			continue;
		TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
		TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist);
		mtx_unlock(&vnode_list_mtx);
		kern_yield(PRI_USER);
		mtx_lock(&vnode_list_mtx);
		goto restart;
	}
	if (done == 0 && !retried) {
		TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
		TAILQ_INSERT_HEAD(&vnode_list, mvp, v_vnodelist);
		retried = true;
		goto restart;
	}
	return (done);
}

static int max_free_per_call = 10000;
SYSCTL_INT(_debug, OID_AUTO, max_vnlru_free, CTLFLAG_RW, &max_free_per_call, 0,
    "limit on vnode free requests per call to the vnlru_free routine (legacy)");
SYSCTL_INT(_vfs_vnode_vnlru, OID_AUTO, max_free_per_call, CTLFLAG_RW,
    &max_free_per_call, 0,
    "limit on vnode free requests per call to the vnlru_free routine");

/*
 * Attempt to recycle requested amount of free vnodes.
 */
static int
vnlru_free_impl(int count, struct vfsops *mnt_op, struct vnode *mvp, bool isvnlru)
{
	struct vnode *vp;
	struct mount *mp;
	int ocount;
	bool retried;

	mtx_assert(&vnode_list_mtx, MA_OWNED);
	if (count > max_free_per_call)
		count = max_free_per_call;
	if (count == 0) {
		mtx_unlock(&vnode_list_mtx);
		return (0);
	}
	ocount = count;
	retried = false;
	vp = mvp;
	for (;;) {
		vp = TAILQ_NEXT(vp, v_vnodelist);
		if (__predict_false(vp == NULL)) {
			/*
			 * The free vnode marker can be past eligible vnodes:
			 * 1. if vdbatch_process trylock failed
			 * 2. if vtryrecycle failed
			 *
			 * If so, start the scan from scratch.
			 */
			if (!retried && vnlru_read_freevnodes() > 0) {
				TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
				TAILQ_INSERT_HEAD(&vnode_list, mvp, v_vnodelist);
				vp = mvp;
				retried = true;
				continue;
			}

			/*
			 * Give up
			 */
			TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
			TAILQ_INSERT_TAIL(&vnode_list, mvp, v_vnodelist);
			mtx_unlock(&vnode_list_mtx);
			break;
		}
		if (__predict_false(vp->v_type == VMARKER))
			continue;
		if (vp->v_holdcnt > 0)
			continue;
		/*
		 * Don't recycle if our vnode is from different type
		 * of mount point.  Note that mp is type-safe, the
		 * check does not reach unmapped address even if
		 * vnode is reclaimed.
		 */
		if (mnt_op != NULL && (mp = vp->v_mount) != NULL &&
		    mp->mnt_op != mnt_op) {
			continue;
		}
		if (__predict_false(vp->v_type == VBAD || vp->v_type == VNON)) {
			continue;
		}
		if (!vhold_recycle_free(vp))
			continue;
		TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
		TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist);
		mtx_unlock(&vnode_list_mtx);
		/*
		 * FIXME: the return value is ignored, meaning it may be that
		 * nothing got recycled even though the caller is told
		 * otherwise.
		 *
		 * Originally the value started being ignored in 2005 with
		 * 114a1006a8204aa156e1f9ad6476cdff89cada7f.
		 *
		 * Respecting the value can run into significant stalls if most
		 * vnodes belong to one file system and it has writes
		 * suspended.  In presence of many threads and millions of
		 * vnodes they keep contending on the vnode_list_mtx lock only
		 * to find vnodes they can't recycle.
		 *
		 * The solution would be to pre-check if the vnode is likely to
		 * be recycle-able, but it needs to happen with the
		 * vnode_list_mtx lock held.  This runs into a problem where
		 * VOP_GETWRITEMOUNT (currently needed to find out whether
		 * writes are frozen) can take locks which LOR against it.
		 *
		 * Check nullfs for one example (null_getwritemount).
		 */
		vtryrecycle(vp, isvnlru);
		count--;
		if (count == 0) {
			break;
		}
		mtx_lock(&vnode_list_mtx);
		vp = mvp;
	}
	mtx_assert(&vnode_list_mtx, MA_NOTOWNED);
	return (ocount - count);
}

/*
 * XXX: returns without vnode_list_mtx locked!
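 * (Both vnlru_free_locked_* wrappers below end with vnode_list_mtx dropped
 * because vnlru_free_impl() unlocks it on all return paths; callers that
 * need the lock again must retake it themselves.)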
 */
static int
vnlru_free_locked_direct(int count)
{
	int ret;

	mtx_assert(&vnode_list_mtx, MA_OWNED);
	ret = vnlru_free_impl(count, NULL, vnode_list_free_marker, false);
	mtx_assert(&vnode_list_mtx, MA_NOTOWNED);
	return (ret);
}

static int
vnlru_free_locked_vnlru(int count)
{
	int ret;

	mtx_assert(&vnode_list_mtx, MA_OWNED);
	ret = vnlru_free_impl(count, NULL, vnode_list_free_marker, true);
	mtx_assert(&vnode_list_mtx, MA_NOTOWNED);
	return (ret);
}

static int
vnlru_free_vnlru(int count)
{

	mtx_lock(&vnode_list_mtx);
	return (vnlru_free_locked_vnlru(count));
}

void
vnlru_free_vfsops(int count, struct vfsops *mnt_op, struct vnode *mvp)
{

	MPASS(mnt_op != NULL);
	MPASS(mvp != NULL);
	VNPASS(mvp->v_type == VMARKER, mvp);
	mtx_lock(&vnode_list_mtx);
	vnlru_free_impl(count, mnt_op, mvp, true);
	mtx_assert(&vnode_list_mtx, MA_NOTOWNED);
}

struct vnode *
vnlru_alloc_marker(void)
{
	struct vnode *mvp;

	mvp = vn_alloc_marker(NULL);
	mtx_lock(&vnode_list_mtx);
	TAILQ_INSERT_BEFORE(vnode_list_free_marker, mvp, v_vnodelist);
	mtx_unlock(&vnode_list_mtx);
	return (mvp);
}

void
vnlru_free_marker(struct vnode *mvp)
{
	mtx_lock(&vnode_list_mtx);
	TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
	mtx_unlock(&vnode_list_mtx);
	vn_free_marker(mvp);
}

static void
vnlru_recalc(void)
{

	mtx_assert(&vnode_list_mtx, MA_OWNED);
	gapvnodes = imax(desiredvnodes - wantfreevnodes, 100);
	vhiwat = gapvnodes / 11; /* 9% -- just under the 10% in vlrureclaim() */
	vlowat = vhiwat / 2;
}

/*
 * Attempt to recycle vnodes in a context that is always safe to block.
 * Calling vlrurecycle() from the bowels of filesystem code has some
 * interesting deadlock problems.
 */
static struct proc *vnlruproc;
static int vnlruproc_sig;
static u_long vnlruproc_kicks;

SYSCTL_ULONG(_vfs_vnode_vnlru, OID_AUTO, kicks, CTLFLAG_RD, &vnlruproc_kicks, 0,
    "Number of times vnlru awakened due to vnode shortage");

#define	VNLRU_COUNT_SLOP	100

/*
 * The main freevnodes counter is only updated when a counter local to CPU
 * diverges from 0 by more than VNLRU_FREEVNODES_SLOP.  CPUs are conditionally
 * walked to compute a more accurate total.
 *
 * Note: the actual value at any given moment can still exceed slop, but it
 * should not be by significant margin in practice.
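 *
 * Illustrative bound (example numbers): each per-CPU int8_t delta is flushed
 * into the global counter once it reaches +/-VNLRU_FREEVNODES_SLOP, so with
 * 64 CPUs the global count can lag the true value by at most roughly
 * 64 * 125 = 8000 vnodes between flushes.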
 */
#define	VNLRU_FREEVNODES_SLOP 126

static void __noinline
vfs_freevnodes_rollup(int8_t *lfreevnodes)
{

	atomic_add_long(&freevnodes, *lfreevnodes);
	*lfreevnodes = 0;
	critical_exit();
}

static __inline void
vfs_freevnodes_inc(void)
{
	int8_t *lfreevnodes;

	critical_enter();
	lfreevnodes = PCPU_PTR(vfs_freevnodes);
	(*lfreevnodes)++;
	if (__predict_false(*lfreevnodes == VNLRU_FREEVNODES_SLOP))
		vfs_freevnodes_rollup(lfreevnodes);
	else
		critical_exit();
}

static __inline void
vfs_freevnodes_dec(void)
{
	int8_t *lfreevnodes;

	critical_enter();
	lfreevnodes = PCPU_PTR(vfs_freevnodes);
	(*lfreevnodes)--;
	if (__predict_false(*lfreevnodes == -VNLRU_FREEVNODES_SLOP))
		vfs_freevnodes_rollup(lfreevnodes);
	else
		critical_exit();
}

static u_long
vnlru_read_freevnodes(void)
{
	long slop, rfreevnodes, rfreevnodes_old;
	int cpu;

	rfreevnodes = atomic_load_long(&freevnodes);
	rfreevnodes_old = atomic_load_long(&freevnodes_old);

	if (rfreevnodes > rfreevnodes_old)
		slop = rfreevnodes - rfreevnodes_old;
	else
		slop = rfreevnodes_old - rfreevnodes;
	if (slop < VNLRU_FREEVNODES_SLOP)
		return (rfreevnodes >= 0 ? rfreevnodes : 0);
	CPU_FOREACH(cpu) {
		rfreevnodes += cpuid_to_pcpu[cpu]->pc_vfs_freevnodes;
	}
	atomic_store_long(&freevnodes_old, rfreevnodes);
	return (freevnodes_old >= 0 ? freevnodes_old : 0);
}

static bool
vnlru_under(u_long rnumvnodes, u_long limit)
{
	u_long rfreevnodes, space;

	if (__predict_false(rnumvnodes > desiredvnodes))
		return (true);

	space = desiredvnodes - rnumvnodes;
	if (space < limit) {
		rfreevnodes = vnlru_read_freevnodes();
		if (rfreevnodes > wantfreevnodes)
			space += rfreevnodes - wantfreevnodes;
	}
	return (space < limit);
}

static void
vnlru_kick_locked(void)
{

	mtx_assert(&vnode_list_mtx, MA_OWNED);
	if (vnlruproc_sig == 0) {
		vnlruproc_sig = 1;
		vnlruproc_kicks++;
		wakeup(vnlruproc);
	}
}

static void
vnlru_kick_cond(void)
{

	if (vnlru_read_freevnodes() > wantfreevnodes)
		return;

	if (vnlruproc_sig)
		return;
	mtx_lock(&vnode_list_mtx);
	vnlru_kick_locked();
	mtx_unlock(&vnode_list_mtx);
}

static void
vnlru_proc_sleep(void)
{

	if (vnlruproc_sig) {
		vnlruproc_sig = 0;
		wakeup(&vnlruproc_sig);
	}
	msleep(vnlruproc, &vnode_list_mtx, PVFS|PDROP, "vlruwt", hz);
}

/*
 * A lighter version of the machinery below.
 *
 * Tries to reach goals only by recycling free vnodes and does not invoke
 * uma_reclaim(UMA_RECLAIM_DRAIN).
 *
 * This works around pathological behavior in vnlru in presence of tons of free
 * vnodes, but without having to rewrite the machinery at this time.  Said
 * behavior boils down to continuously trying to reclaim all kinds of vnodes
 * (cycling through all levels of "force") when the count is transiently above
 * limit.  This happens a lot when all vnodes are used up and vn_alloc
 * speculatively increments the counter.
 *
 * Sample testcase: vnode limit 8388608, 20 separate directory trees each with
 * 1 million files in total and 20 find(1) processes stating them in parallel
 * (one per tree).
 *
 * On a kernel with only stock machinery this needs anywhere between 60 and 120
 * seconds to execute (time varies *wildly* between runs).  With the workaround
 * it consistently stays around 20 seconds [it got further down with later
 * changes].
 *
 * That is to say the entire thing needs a fundamental redesign (most notably
 * to accommodate faster recycling), the above only tries to get it out of the
 * way.
 *
 * Return values are:
 * -1 -- fallback to regular vnlru loop
 *  0 -- do nothing, go to sleep
 * >0 -- recycle this many vnodes
 */
static long
vnlru_proc_light_pick(void)
{
	u_long rnumvnodes, rfreevnodes;

	if (vstir || vnlruproc_sig == 1)
		return (-1);

	rnumvnodes = atomic_load_long(&numvnodes);
	rfreevnodes = vnlru_read_freevnodes();

	/*
	 * vnode limit might have changed and now we may be at a significant
	 * excess.  Bail if we can't sort it out with free vnodes.
	 *
	 * Due to atomic updates the count can legitimately go above
	 * the limit for a short period, don't bother doing anything in
	 * that case.
	 */
	if (rnumvnodes > desiredvnodes + VNLRU_COUNT_SLOP + 10) {
		if (rnumvnodes - rfreevnodes >= desiredvnodes ||
		    rfreevnodes <= wantfreevnodes) {
			return (-1);
		}

		return (rnumvnodes - desiredvnodes);
	}

	/*
	 * Don't try to reach wantfreevnodes target if there are too few vnodes
	 * to begin with.
	 */
	if (rnumvnodes < wantfreevnodes) {
		return (0);
	}

	if (rfreevnodes < wantfreevnodes) {
		return (-1);
	}

	return (0);
}

static bool
vnlru_proc_light(void)
{
	long freecount;

	mtx_assert(&vnode_list_mtx, MA_NOTOWNED);

	freecount = vnlru_proc_light_pick();
	if (freecount == -1)
		return (false);

	if (freecount != 0) {
		vnlru_free_vnlru(freecount);
	}

	mtx_lock(&vnode_list_mtx);
	vnlru_proc_sleep();
	mtx_assert(&vnode_list_mtx, MA_NOTOWNED);
	return (true);
}

static u_long uma_reclaim_calls;
SYSCTL_ULONG(_vfs_vnode_vnlru, OID_AUTO, uma_reclaim_calls, CTLFLAG_RD | CTLFLAG_STATS,
    &uma_reclaim_calls, 0, "Number of calls to uma_reclaim");

static void
vnlru_proc(void)
{
	u_long rnumvnodes, rfreevnodes, target;
	unsigned long onumvnodes;
	int done, force, trigger, usevnodes;
	bool reclaim_nc_src, want_reread;

	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, vnlruproc,
	    SHUTDOWN_PRI_FIRST);

	force = 0;
	want_reread = false;
	for (;;) {
		kproc_suspend_check(vnlruproc);

		if (force == 0 && vnlru_proc_light())
			continue;

		mtx_lock(&vnode_list_mtx);
		rnumvnodes = atomic_load_long(&numvnodes);

		if (want_reread) {
			force = vnlru_under(numvnodes, vhiwat) ? 1 : 0;
			want_reread = false;
		}

		/*
		 * If numvnodes is too large (due to desiredvnodes being
		 * adjusted using its sysctl, or emergency growth), first
		 * try to reduce it by discarding free vnodes.
1802 */ 1803 if (rnumvnodes > desiredvnodes + 10) { 1804 vnlru_free_locked_vnlru(rnumvnodes - desiredvnodes); 1805 mtx_lock(&vnode_list_mtx); 1806 rnumvnodes = atomic_load_long(&numvnodes); 1807 } 1808 /* 1809 * Sleep if the vnode cache is in a good state. This is 1810 * when it is not over-full and has space for about a 4% 1811 * or 9% expansion (by growing its size or inexcessively 1812 * reducing free vnode count). Otherwise, try to reclaim 1813 * space for a 10% expansion. 1814 */ 1815 if (vstir && force == 0) { 1816 force = 1; 1817 vstir = false; 1818 } 1819 if (force == 0 && !vnlru_under(rnumvnodes, vlowat)) { 1820 vnlru_proc_sleep(); 1821 continue; 1822 } 1823 rfreevnodes = vnlru_read_freevnodes(); 1824 1825 onumvnodes = rnumvnodes; 1826 /* 1827 * Calculate parameters for recycling. These are the same 1828 * throughout the loop to give some semblance of fairness. 1829 * The trigger point is to avoid recycling vnodes with lots 1830 * of resident pages. We aren't trying to free memory; we 1831 * are trying to recycle or at least free vnodes. 1832 */ 1833 if (rnumvnodes <= desiredvnodes) 1834 usevnodes = rnumvnodes - rfreevnodes; 1835 else 1836 usevnodes = rnumvnodes; 1837 if (usevnodes <= 0) 1838 usevnodes = 1; 1839 /* 1840 * The trigger value is chosen to give a conservatively 1841 * large value to ensure that it alone doesn't prevent 1842 * making progress. The value can easily be so large that 1843 * it is effectively infinite in some congested and 1844 * misconfigured cases, and this is necessary. Normally 1845 * it is about 8 to 100 (pages), which is quite large. 1846 */ 1847 trigger = vm_cnt.v_page_count * 2 / usevnodes; 1848 if (force < 2) 1849 trigger = vsmalltrigger; 1850 reclaim_nc_src = force >= 3; 1851 target = rnumvnodes * (int64_t)gapvnodes / imax(desiredvnodes, 1); 1852 target = target / 10 + 1; 1853 done = vlrureclaim(reclaim_nc_src, trigger, target); 1854 mtx_unlock(&vnode_list_mtx); 1855 /* 1856 * Total number of vnodes can transiently go slightly above the 1857 * limit (see vn_alloc_hard), no need to call uma_reclaim if 1858 * this happens. 1859 */ 1860 if (onumvnodes + VNLRU_COUNT_SLOP + 1000 > desiredvnodes && 1861 numvnodes <= desiredvnodes) { 1862 uma_reclaim_calls++; 1863 uma_reclaim(UMA_RECLAIM_DRAIN); 1864 } 1865 if (done == 0) { 1866 if (force == 0 || force == 1) { 1867 force = 2; 1868 continue; 1869 } 1870 if (force == 2) { 1871 force = 3; 1872 continue; 1873 } 1874 want_reread = true; 1875 force = 0; 1876 vnlru_nowhere++; 1877 tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3); 1878 } else { 1879 want_reread = true; 1880 kern_yield(PRI_USER); 1881 } 1882 } 1883 } 1884 1885 static struct kproc_desc vnlru_kp = { 1886 "vnlru", 1887 vnlru_proc, 1888 &vnlruproc 1889 }; 1890 SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, 1891 &vnlru_kp); 1892 1893 /* 1894 * Routines having to do with the management of the vnode table. 1895 */ 1896 1897 /* 1898 * Try to recycle a freed vnode. 1899 */ 1900 static int 1901 vtryrecycle(struct vnode *vp, bool isvnlru) 1902 { 1903 struct mount *vnmp; 1904 1905 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 1906 VNPASS(vp->v_holdcnt > 0, vp); 1907 /* 1908 * This vnode may found and locked via some other list, if so we 1909 * can't recycle it yet. 
1910 */ 1911 if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) { 1912 CTR2(KTR_VFS, 1913 "%s: impossible to recycle, vp %p lock is already held", 1914 __func__, vp); 1915 vdrop_recycle(vp); 1916 return (EWOULDBLOCK); 1917 } 1918 /* 1919 * Don't recycle if its filesystem is being suspended. 1920 */ 1921 if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) { 1922 VOP_UNLOCK(vp); 1923 CTR2(KTR_VFS, 1924 "%s: impossible to recycle, cannot start the write for %p", 1925 __func__, vp); 1926 vdrop_recycle(vp); 1927 return (EBUSY); 1928 } 1929 /* 1930 * If we got this far, we need to acquire the interlock and see if 1931 * anyone picked up this vnode from another list. If not, we will 1932 * mark it with DOOMED via vgonel() so that anyone who does find it 1933 * will skip over it. 1934 */ 1935 VI_LOCK(vp); 1936 if (vp->v_usecount) { 1937 VOP_UNLOCK(vp); 1938 vdropl_recycle(vp); 1939 vn_finished_write(vnmp); 1940 CTR2(KTR_VFS, 1941 "%s: impossible to recycle, %p is already referenced", 1942 __func__, vp); 1943 return (EBUSY); 1944 } 1945 if (!VN_IS_DOOMED(vp)) { 1946 if (isvnlru) 1947 recycles_free_count++; 1948 else 1949 counter_u64_add(direct_recycles_free_count, 1); 1950 vgonel(vp); 1951 } 1952 VOP_UNLOCK(vp); 1953 vdropl_recycle(vp); 1954 vn_finished_write(vnmp); 1955 return (0); 1956 } 1957 1958 /* 1959 * Allocate a new vnode. 1960 * 1961 * The operation never returns an error. Returning an error was disabled 1962 * in r145385 (dated 2005) with the following comment: 1963 * 1964 * XXX Not all VFS_VGET/ffs_vget callers check returns. 1965 * 1966 * Given the age of this commit (almost 15 years at the time of writing this 1967 * comment) restoring the ability to fail requires a significant audit of 1968 * all codepaths. 1969 * 1970 * The routine can try to free a vnode or stall for up to 1 second waiting for 1971 * vnlru to clear things up, but ultimately always performs a M_WAITOK allocation. 1972 */ 1973 static u_long vn_alloc_cyclecount; 1974 static u_long vn_alloc_sleeps; 1975 1976 SYSCTL_ULONG(_vfs_vnode_stats, OID_AUTO, alloc_sleeps, CTLFLAG_RD, &vn_alloc_sleeps, 0, 1977 "Number of times vnode allocation blocked waiting on vnlru"); 1978 1979 static struct vnode * __noinline 1980 vn_alloc_hard(struct mount *mp, u_long rnumvnodes, bool bumped) 1981 { 1982 u_long rfreevnodes; 1983 1984 if (bumped) { 1985 if (rnumvnodes > desiredvnodes + VNLRU_COUNT_SLOP) { 1986 atomic_subtract_long(&numvnodes, 1); 1987 bumped = false; 1988 } 1989 } 1990 1991 mtx_lock(&vnode_list_mtx); 1992 1993 /* 1994 * Reload 'numvnodes', as since we acquired the lock, it may have 1995 * changed significantly if we waited, and 'rnumvnodes' above was only 1996 * actually passed if 'bumped' is true (else it is 0). 1997 */ 1998 rnumvnodes = atomic_load_long(&numvnodes); 1999 if (rnumvnodes + !bumped < desiredvnodes) { 2000 vn_alloc_cyclecount = 0; 2001 mtx_unlock(&vnode_list_mtx); 2002 goto alloc; 2003 } 2004 2005 rfreevnodes = vnlru_read_freevnodes(); 2006 if (vn_alloc_cyclecount++ >= rfreevnodes) { 2007 vn_alloc_cyclecount = 0; 2008 vstir = true; 2009 } 2010 2011 /* 2012 * Grow the vnode cache if it will not be above its target max after 2013 * growing. Otherwise, if there is at least one free vnode, try to 2014 * reclaim 1 item from it before growing the cache (possibly above its 2015 * target max if the reclamation failed or is delayed). 
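 *
 * A rough sketch of the flow below (illustrative summary only, not
 * separate logic):
 *
 *	if (vnlru_free_locked_direct(1) > 0)
 *		goto alloc;
 *	if (the filesystem is not suspended) {
 *		kick vnlru and sleep up to 1 second ("vlruwk");
 *		try to free one more vnode;
 *	}
 *	allocate with M_WAITOK regardless;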
2016 */ 2017 if (vnlru_free_locked_direct(1) > 0) 2018 goto alloc; 2019 mtx_assert(&vnode_list_mtx, MA_NOTOWNED); 2020 if (mp == NULL || (mp->mnt_kern_flag & MNTK_SUSPEND) == 0) { 2021 /* 2022 * Wait for space for a new vnode. 2023 */ 2024 if (bumped) { 2025 atomic_subtract_long(&numvnodes, 1); 2026 bumped = false; 2027 } 2028 mtx_lock(&vnode_list_mtx); 2029 vnlru_kick_locked(); 2030 vn_alloc_sleeps++; 2031 msleep(&vnlruproc_sig, &vnode_list_mtx, PVFS, "vlruwk", hz); 2032 if (atomic_load_long(&numvnodes) + 1 > desiredvnodes && 2033 vnlru_read_freevnodes() > 1) 2034 vnlru_free_locked_direct(1); 2035 else 2036 mtx_unlock(&vnode_list_mtx); 2037 } 2038 alloc: 2039 mtx_assert(&vnode_list_mtx, MA_NOTOWNED); 2040 if (!bumped) 2041 atomic_add_long(&numvnodes, 1); 2042 vnlru_kick_cond(); 2043 return (uma_zalloc_smr(vnode_zone, M_WAITOK)); 2044 } 2045 2046 static struct vnode * 2047 vn_alloc(struct mount *mp) 2048 { 2049 u_long rnumvnodes; 2050 2051 if (__predict_false(vn_alloc_cyclecount != 0)) 2052 return (vn_alloc_hard(mp, 0, false)); 2053 rnumvnodes = atomic_fetchadd_long(&numvnodes, 1) + 1; 2054 if (__predict_false(vnlru_under(rnumvnodes, vlowat))) { 2055 return (vn_alloc_hard(mp, rnumvnodes, true)); 2056 } 2057 2058 return (uma_zalloc_smr(vnode_zone, M_WAITOK)); 2059 } 2060 2061 static void 2062 vn_free(struct vnode *vp) 2063 { 2064 2065 atomic_subtract_long(&numvnodes, 1); 2066 uma_zfree_smr(vnode_zone, vp); 2067 } 2068 2069 /* 2070 * Allocate a new vnode. 2071 */ 2072 int 2073 getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops, 2074 struct vnode **vpp) 2075 { 2076 struct vnode *vp; 2077 struct thread *td; 2078 struct lock_object *lo; 2079 2080 CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag); 2081 2082 KASSERT(vops->registered, 2083 ("%s: not registered vector op %p\n", __func__, vops)); 2084 cache_validate_vop_vector(mp, vops); 2085 2086 td = curthread; 2087 if (td->td_vp_reserved != NULL) { 2088 vp = td->td_vp_reserved; 2089 td->td_vp_reserved = NULL; 2090 } else { 2091 vp = vn_alloc(mp); 2092 } 2093 counter_u64_add(vnodes_created, 1); 2094 2095 vn_set_state(vp, VSTATE_UNINITIALIZED); 2096 2097 /* 2098 * Locks are given the generic name "vnode" when created. 2099 * Follow the historic practice of using the filesystem 2100 * name when they allocated, e.g., "zfs", "ufs", "nfs, etc. 2101 * 2102 * Locks live in a witness group keyed on their name. Thus, 2103 * when a lock is renamed, it must also move from the witness 2104 * group of its old name to the witness group of its new name. 2105 * 2106 * The change only needs to be made when the vnode moves 2107 * from one filesystem type to another. We ensure that each 2108 * filesystem use a single static name pointer for its tag so 2109 * that we can compare pointers rather than doing a strcmp(). 2110 */ 2111 lo = &vp->v_vnlock->lock_object; 2112 #ifdef WITNESS 2113 if (lo->lo_name != tag) { 2114 #endif 2115 lo->lo_name = tag; 2116 #ifdef WITNESS 2117 WITNESS_DESTROY(lo); 2118 WITNESS_INIT(lo, tag); 2119 } 2120 #endif 2121 /* 2122 * By default, don't allow shared locks unless filesystems opt-in. 2123 */ 2124 vp->v_vnlock->lock_object.lo_flags |= LK_NOSHARE; 2125 /* 2126 * Finalize various vnode identity bits. 
2127 */ 2128 KASSERT(vp->v_object == NULL, ("stale v_object %p", vp)); 2129 KASSERT(vp->v_lockf == NULL, ("stale v_lockf %p", vp)); 2130 KASSERT(vp->v_pollinfo == NULL, ("stale v_pollinfo %p", vp)); 2131 vp->v_type = VNON; 2132 vp->v_op = vops; 2133 vp->v_irflag = 0; 2134 v_init_counters(vp); 2135 vn_seqc_init(vp); 2136 vp->v_bufobj.bo_ops = &buf_ops_bio; 2137 #ifdef DIAGNOSTIC 2138 if (mp == NULL && vops != &dead_vnodeops) 2139 printf("NULL mp in getnewvnode(9), tag %s\n", tag); 2140 #endif 2141 #ifdef MAC 2142 mac_vnode_init(vp); 2143 if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0) 2144 mac_vnode_associate_singlelabel(mp, vp); 2145 #endif 2146 if (mp != NULL) { 2147 vp->v_bufobj.bo_bsize = mp->mnt_stat.f_iosize; 2148 } 2149 2150 /* 2151 * For the filesystems which do not use vfs_hash_insert(), 2152 * still initialize v_hash to have vfs_hash_index() useful. 2153 * E.g., nullfs uses vfs_hash_index() on the lower vnode for 2154 * its own hashing. 2155 */ 2156 vp->v_hash = (uintptr_t)vp >> vnsz2log; 2157 2158 *vpp = vp; 2159 return (0); 2160 } 2161 2162 void 2163 getnewvnode_reserve(void) 2164 { 2165 struct thread *td; 2166 2167 td = curthread; 2168 MPASS(td->td_vp_reserved == NULL); 2169 td->td_vp_reserved = vn_alloc(NULL); 2170 } 2171 2172 void 2173 getnewvnode_drop_reserve(void) 2174 { 2175 struct thread *td; 2176 2177 td = curthread; 2178 if (td->td_vp_reserved != NULL) { 2179 vn_free(td->td_vp_reserved); 2180 td->td_vp_reserved = NULL; 2181 } 2182 } 2183 2184 static void __noinline 2185 freevnode(struct vnode *vp) 2186 { 2187 struct bufobj *bo; 2188 2189 ASSERT_VOP_UNLOCKED(vp, __func__); 2190 2191 /* 2192 * The vnode has been marked for destruction, so free it. 2193 * 2194 * The vnode will be returned to the zone where it will 2195 * normally remain until it is needed for another vnode. We 2196 * need to cleanup (or verify that the cleanup has already 2197 * been done) any residual data left from its current use 2198 * so as not to contaminate the freshly allocated vnode. 2199 */ 2200 CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp); 2201 /* 2202 * Paired with vgone. 2203 */ 2204 vn_seqc_write_end_free(vp); 2205 2206 bo = &vp->v_bufobj; 2207 VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't")); 2208 VNPASS(vp->v_holdcnt == VHOLD_NO_SMR, vp); 2209 VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count")); 2210 VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count")); 2211 VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's")); 2212 VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0")); 2213 VNASSERT(pctrie_is_empty(&bo->bo_clean.bv_root), vp, 2214 ("clean blk trie not empty")); 2215 VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0")); 2216 VNASSERT(pctrie_is_empty(&bo->bo_dirty.bv_root), vp, 2217 ("dirty blk trie not empty")); 2218 VNASSERT((vp->v_iflag & (VI_DOINGINACT | VI_OWEINACT)) == 0, vp, 2219 ("Leaked inactivation")); 2220 VI_UNLOCK(vp); 2221 cache_assert_no_entries(vp); 2222 2223 #ifdef MAC 2224 mac_vnode_destroy(vp); 2225 #endif 2226 if (vp->v_pollinfo != NULL) { 2227 int error __diagused; 2228 2229 /* 2230 * Use LK_NOWAIT to shut up witness about the lock. We may get 2231 * here while having another vnode locked when trying to 2232 * satisfy a lookup and needing to recycle. 
2233 */ 2234 error = VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT); 2235 VNASSERT(error == 0, vp, 2236 ("freevnode: cannot lock vp %p for pollinfo destroy", vp)); 2237 destroy_vpollinfo(vp->v_pollinfo); 2238 VOP_UNLOCK(vp); 2239 vp->v_pollinfo = NULL; 2240 } 2241 vp->v_mountedhere = NULL; 2242 vp->v_unpcb = NULL; 2243 vp->v_rdev = NULL; 2244 vp->v_fifoinfo = NULL; 2245 vp->v_iflag = 0; 2246 vp->v_vflag = 0; 2247 bo->bo_flag = 0; 2248 vn_free(vp); 2249 } 2250 2251 /* 2252 * Delete from old mount point vnode list, if on one. 2253 */ 2254 static void 2255 delmntque(struct vnode *vp) 2256 { 2257 struct mount *mp; 2258 2259 VNPASS((vp->v_mflag & VMP_LAZYLIST) == 0, vp); 2260 2261 mp = vp->v_mount; 2262 MNT_ILOCK(mp); 2263 VI_LOCK(vp); 2264 vp->v_mount = NULL; 2265 VNASSERT(mp->mnt_nvnodelistsize > 0, vp, 2266 ("bad mount point vnode list size")); 2267 TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); 2268 mp->mnt_nvnodelistsize--; 2269 MNT_REL(mp); 2270 MNT_IUNLOCK(mp); 2271 /* 2272 * The caller expects the interlock to be still held. 2273 */ 2274 ASSERT_VI_LOCKED(vp, __func__); 2275 } 2276 2277 static int 2278 insmntque1_int(struct vnode *vp, struct mount *mp, bool dtr) 2279 { 2280 2281 KASSERT(vp->v_mount == NULL, 2282 ("insmntque: vnode already on per mount vnode list")); 2283 VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)")); 2284 if ((mp->mnt_kern_flag & MNTK_UNLOCKED_INSMNTQUE) == 0) { 2285 ASSERT_VOP_ELOCKED(vp, "insmntque: non-locked vp"); 2286 } else { 2287 KASSERT(!dtr, 2288 ("%s: can't have MNTK_UNLOCKED_INSMNTQUE and cleanup", 2289 __func__)); 2290 } 2291 2292 /* 2293 * We acquire the vnode interlock early to ensure that the 2294 * vnode cannot be recycled by another process releasing a 2295 * holdcnt on it before we get it on both the vnode list 2296 * and the active vnode list. The mount mutex protects only 2297 * manipulation of the vnode list and the vnode freelist 2298 * mutex protects only manipulation of the active vnode list. 2299 * Hence the need to hold the vnode interlock throughout. 2300 */ 2301 MNT_ILOCK(mp); 2302 VI_LOCK(vp); 2303 if (((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0 && 2304 ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 || 2305 mp->mnt_nvnodelistsize == 0)) && 2306 (vp->v_vflag & VV_FORCEINSMQ) == 0) { 2307 VI_UNLOCK(vp); 2308 MNT_IUNLOCK(mp); 2309 if (dtr) { 2310 vp->v_data = NULL; 2311 vp->v_op = &dead_vnodeops; 2312 vgone(vp); 2313 vput(vp); 2314 } 2315 return (EBUSY); 2316 } 2317 vp->v_mount = mp; 2318 MNT_REF(mp); 2319 TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); 2320 VNASSERT(mp->mnt_nvnodelistsize >= 0, vp, 2321 ("neg mount point vnode list size")); 2322 mp->mnt_nvnodelistsize++; 2323 VI_UNLOCK(vp); 2324 MNT_IUNLOCK(mp); 2325 return (0); 2326 } 2327 2328 /* 2329 * Insert into list of vnodes for the new mount point, if available. 2330 * insmntque() reclaims the vnode on insertion failure, insmntque1() 2331 * leaves handling of the vnode to the caller. 2332 */ 2333 int 2334 insmntque(struct vnode *vp, struct mount *mp) 2335 { 2336 return (insmntque1_int(vp, mp, true)); 2337 } 2338 2339 int 2340 insmntque1(struct vnode *vp, struct mount *mp) 2341 { 2342 return (insmntque1_int(vp, mp, false)); 2343 } 2344 2345 /* 2346 * Flush out and invalidate all buffers associated with a bufobj 2347 * Called with the underlying object locked. 
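 *
 * The V_SAVE flag requests that dirty buffers be synced (and the writes
 * waited for) before everything is invalidated; without it their contents
 * are simply discarded.  A caller tearing down a vnode might therefore
 * try vinvalbuf(vp, V_SAVE, 0, 0) first and fall back to
 * vinvalbuf(vp, 0, 0, 0) if that fails (illustrative usage, not taken
 * from this file).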
2348 */ 2349 int 2350 bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo) 2351 { 2352 int error; 2353 2354 BO_LOCK(bo); 2355 if (flags & V_SAVE) { 2356 error = bufobj_wwait(bo, slpflag, slptimeo); 2357 if (error) { 2358 BO_UNLOCK(bo); 2359 return (error); 2360 } 2361 if (bo->bo_dirty.bv_cnt > 0) { 2362 BO_UNLOCK(bo); 2363 do { 2364 error = BO_SYNC(bo, MNT_WAIT); 2365 } while (error == ERELOOKUP); 2366 if (error != 0) 2367 return (error); 2368 BO_LOCK(bo); 2369 if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) { 2370 BO_UNLOCK(bo); 2371 return (EBUSY); 2372 } 2373 } 2374 } 2375 /* 2376 * If you alter this loop please notice that interlock is dropped and 2377 * reacquired in flushbuflist. Special care is needed to ensure that 2378 * no race conditions occur from this. 2379 */ 2380 do { 2381 error = flushbuflist(&bo->bo_clean, 2382 flags, bo, slpflag, slptimeo); 2383 if (error == 0 && !(flags & V_CLEANONLY)) 2384 error = flushbuflist(&bo->bo_dirty, 2385 flags, bo, slpflag, slptimeo); 2386 if (error != 0 && error != EAGAIN) { 2387 BO_UNLOCK(bo); 2388 return (error); 2389 } 2390 } while (error != 0); 2391 2392 /* 2393 * Wait for I/O to complete. XXX needs cleaning up. The vnode can 2394 * have write I/O in-progress but if there is a VM object then the 2395 * VM object can also have read-I/O in-progress. 2396 */ 2397 do { 2398 bufobj_wwait(bo, 0, 0); 2399 if ((flags & V_VMIO) == 0 && bo->bo_object != NULL) { 2400 BO_UNLOCK(bo); 2401 vm_object_pip_wait_unlocked(bo->bo_object, "bovlbx"); 2402 BO_LOCK(bo); 2403 } 2404 } while (bo->bo_numoutput > 0); 2405 BO_UNLOCK(bo); 2406 2407 /* 2408 * Destroy the copy in the VM cache, too. 2409 */ 2410 if (bo->bo_object != NULL && 2411 (flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0) { 2412 VM_OBJECT_WLOCK(bo->bo_object); 2413 vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ? 2414 OBJPR_CLEANONLY : 0); 2415 VM_OBJECT_WUNLOCK(bo->bo_object); 2416 } 2417 2418 #ifdef INVARIANTS 2419 BO_LOCK(bo); 2420 if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO | 2421 V_ALLOWCLEAN)) == 0 && (bo->bo_dirty.bv_cnt > 0 || 2422 bo->bo_clean.bv_cnt > 0)) 2423 panic("vinvalbuf: flush failed"); 2424 if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0 && 2425 bo->bo_dirty.bv_cnt > 0) 2426 panic("vinvalbuf: flush dirty failed"); 2427 BO_UNLOCK(bo); 2428 #endif 2429 return (0); 2430 } 2431 2432 /* 2433 * Flush out and invalidate all buffers associated with a vnode. 2434 * Called with the underlying object locked. 2435 */ 2436 int 2437 vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo) 2438 { 2439 2440 CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags); 2441 ASSERT_VOP_LOCKED(vp, "vinvalbuf"); 2442 if (vp->v_object != NULL && vp->v_object->handle != vp) 2443 return (0); 2444 return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo)); 2445 } 2446 2447 /* 2448 * Flush out buffers on the specified list. 2449 * 2450 */ 2451 static int 2452 flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag, 2453 int slptimeo) 2454 { 2455 struct buf *bp, *nbp; 2456 int retval, error; 2457 daddr_t lblkno; 2458 b_xflags_t xflags; 2459 2460 ASSERT_BO_WLOCKED(bo); 2461 2462 retval = 0; 2463 TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) { 2464 /* 2465 * If we are flushing both V_NORMAL and V_ALT buffers then 2466 * do not skip any buffers. If we are flushing only V_NORMAL 2467 * buffers then skip buffers marked as BX_ALTDATA. 
If we are 2468 * flushing only V_ALT buffers then skip buffers not marked 2469 * as BX_ALTDATA. 2470 */ 2471 if (((flags & (V_NORMAL | V_ALT)) != (V_NORMAL | V_ALT)) && 2472 (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA) != 0) || 2473 ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0))) { 2474 continue; 2475 } 2476 if (nbp != NULL) { 2477 lblkno = nbp->b_lblkno; 2478 xflags = nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN); 2479 } 2480 retval = EAGAIN; 2481 error = BUF_TIMELOCK(bp, 2482 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo), 2483 "flushbuf", slpflag, slptimeo); 2484 if (error) { 2485 BO_LOCK(bo); 2486 return (error != ENOLCK ? error : EAGAIN); 2487 } 2488 KASSERT(bp->b_bufobj == bo, 2489 ("bp %p wrong b_bufobj %p should be %p", 2490 bp, bp->b_bufobj, bo)); 2491 /* 2492 * XXX Since there are no node locks for NFS, I 2493 * believe there is a slight chance that a delayed 2494 * write will occur while sleeping just above, so 2495 * check for it. 2496 */ 2497 if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) && 2498 (flags & V_SAVE)) { 2499 bremfree(bp); 2500 bp->b_flags |= B_ASYNC; 2501 bwrite(bp); 2502 BO_LOCK(bo); 2503 return (EAGAIN); /* XXX: why not loop ? */ 2504 } 2505 bremfree(bp); 2506 bp->b_flags |= (B_INVAL | B_RELBUF); 2507 bp->b_flags &= ~B_ASYNC; 2508 brelse(bp); 2509 BO_LOCK(bo); 2510 if (nbp == NULL) 2511 break; 2512 nbp = gbincore(bo, lblkno); 2513 if (nbp == NULL || (nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) 2514 != xflags) 2515 break; /* nbp invalid */ 2516 } 2517 return (retval); 2518 } 2519 2520 int 2521 bnoreuselist(struct bufv *bufv, struct bufobj *bo, daddr_t startn, daddr_t endn) 2522 { 2523 struct buf *bp; 2524 int error; 2525 daddr_t lblkno; 2526 2527 ASSERT_BO_LOCKED(bo); 2528 2529 for (lblkno = startn;;) { 2530 again: 2531 bp = buf_lookup_ge(bufv, lblkno); 2532 if (bp == NULL || bp->b_lblkno >= endn) 2533 break; 2534 error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | 2535 LK_INTERLOCK, BO_LOCKPTR(bo), "brlsfl", 0, 0); 2536 if (error != 0) { 2537 BO_RLOCK(bo); 2538 if (error == ENOLCK) 2539 goto again; 2540 return (error); 2541 } 2542 KASSERT(bp->b_bufobj == bo, 2543 ("bp %p wrong b_bufobj %p should be %p", 2544 bp, bp->b_bufobj, bo)); 2545 lblkno = bp->b_lblkno + 1; 2546 if ((bp->b_flags & B_MANAGED) == 0) 2547 bremfree(bp); 2548 bp->b_flags |= B_RELBUF; 2549 /* 2550 * In the VMIO case, use the B_NOREUSE flag to hint that the 2551 * pages backing each buffer in the range are unlikely to be 2552 * reused. Dirty buffers will have the hint applied once 2553 * they've been written. 2554 */ 2555 if ((bp->b_flags & B_VMIO) != 0) 2556 bp->b_flags |= B_NOREUSE; 2557 brelse(bp); 2558 BO_RLOCK(bo); 2559 } 2560 return (0); 2561 } 2562 2563 /* 2564 * Truncate a file's buffer and pages to a specified length. This 2565 * is in lieu of the old vinvalbuf mechanism, which performed unneeded 2566 * sync activity. 2567 */ 2568 int 2569 vtruncbuf(struct vnode *vp, off_t length, int blksize) 2570 { 2571 struct buf *bp, *nbp; 2572 struct bufobj *bo; 2573 daddr_t startlbn; 2574 2575 CTR4(KTR_VFS, "%s: vp %p with block %d:%ju", __func__, 2576 vp, blksize, (uintmax_t)length); 2577 2578 /* 2579 * Round up to the *next* lbn. 2580 */ 2581 startlbn = howmany(length, blksize); 2582 2583 ASSERT_VOP_LOCKED(vp, "vtruncbuf"); 2584 2585 bo = &vp->v_bufobj; 2586 restart_unlocked: 2587 BO_LOCK(bo); 2588 2589 while (v_inval_buf_range_locked(vp, bo, startlbn, INT64_MAX) == EAGAIN) 2590 ; 2591 2592 if (length > 0) { 2593 /* 2594 * Write out vnode metadata, e.g. 
indirect blocks. 2595 */ 2596 restartsync: 2597 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { 2598 if (bp->b_lblkno >= 0) 2599 continue; 2600 /* 2601 * Since we hold the vnode lock this should only 2602 * fail if we're racing with the buf daemon. 2603 */ 2604 if (BUF_LOCK(bp, 2605 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, 2606 BO_LOCKPTR(bo)) == ENOLCK) 2607 goto restart_unlocked; 2608 2609 VNASSERT((bp->b_flags & B_DELWRI), vp, 2610 ("buf(%p) on dirty queue without DELWRI", bp)); 2611 2612 bremfree(bp); 2613 bawrite(bp); 2614 BO_LOCK(bo); 2615 goto restartsync; 2616 } 2617 } 2618 2619 bufobj_wwait(bo, 0, 0); 2620 BO_UNLOCK(bo); 2621 vnode_pager_setsize(vp, length); 2622 2623 return (0); 2624 } 2625 2626 /* 2627 * Invalidate the cached pages of a file's buffer within the range of block 2628 * numbers [startlbn, endlbn). 2629 */ 2630 void 2631 v_inval_buf_range(struct vnode *vp, daddr_t startlbn, daddr_t endlbn, 2632 int blksize) 2633 { 2634 struct bufobj *bo; 2635 off_t start, end; 2636 2637 ASSERT_VOP_LOCKED(vp, "v_inval_buf_range"); 2638 2639 start = blksize * startlbn; 2640 end = blksize * endlbn; 2641 2642 bo = &vp->v_bufobj; 2643 BO_LOCK(bo); 2644 MPASS(blksize == bo->bo_bsize); 2645 2646 while (v_inval_buf_range_locked(vp, bo, startlbn, endlbn) == EAGAIN) 2647 ; 2648 2649 BO_UNLOCK(bo); 2650 vn_pages_remove(vp, OFF_TO_IDX(start), OFF_TO_IDX(end + PAGE_SIZE - 1)); 2651 } 2652 2653 static int 2654 v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo, 2655 daddr_t startlbn, daddr_t endlbn) 2656 { 2657 struct bufv *bv; 2658 struct buf *bp, *nbp; 2659 uint8_t anyfreed; 2660 bool clean; 2661 2662 ASSERT_VOP_LOCKED(vp, "v_inval_buf_range_locked"); 2663 ASSERT_BO_LOCKED(bo); 2664 2665 anyfreed = 1; 2666 clean = true; 2667 do { 2668 bv = clean ? &bo->bo_clean : &bo->bo_dirty; 2669 bp = buf_lookup_ge(bv, startlbn); 2670 if (bp == NULL) 2671 continue; 2672 TAILQ_FOREACH_FROM_SAFE(bp, &bv->bv_hd, b_bobufs, nbp) { 2673 if (bp->b_lblkno >= endlbn) 2674 break; 2675 if (BUF_LOCK(bp, 2676 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, 2677 BO_LOCKPTR(bo)) == ENOLCK) { 2678 BO_LOCK(bo); 2679 return (EAGAIN); 2680 } 2681 2682 bremfree(bp); 2683 bp->b_flags |= B_INVAL | B_RELBUF; 2684 bp->b_flags &= ~B_ASYNC; 2685 brelse(bp); 2686 anyfreed = 2; 2687 2688 BO_LOCK(bo); 2689 if (nbp != NULL && 2690 (((nbp->b_xflags & 2691 (clean ? BX_VNCLEAN : BX_VNDIRTY)) == 0) || 2692 nbp->b_vp != vp || 2693 (nbp->b_flags & B_DELWRI) == (clean? B_DELWRI: 0))) 2694 return (EAGAIN); 2695 } 2696 } while (clean = !clean, anyfreed-- > 0); 2697 return (0); 2698 } 2699 2700 static void 2701 buf_vlist_remove(struct buf *bp) 2702 { 2703 struct bufv *bv; 2704 b_xflags_t flags; 2705 2706 flags = bp->b_xflags; 2707 2708 KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); 2709 ASSERT_BO_WLOCKED(bp->b_bufobj); 2710 KASSERT((flags & (BX_VNDIRTY | BX_VNCLEAN)) != 0 && 2711 (flags & (BX_VNDIRTY | BX_VNCLEAN)) != (BX_VNDIRTY | BX_VNCLEAN), 2712 ("%s: buffer %p has invalid queue state", __func__, bp)); 2713 2714 if ((flags & BX_VNDIRTY) != 0) 2715 bv = &bp->b_bufobj->bo_dirty; 2716 else 2717 bv = &bp->b_bufobj->bo_clean; 2718 BUF_PCTRIE_REMOVE(&bv->bv_root, bp->b_lblkno); 2719 TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs); 2720 bv->bv_cnt--; 2721 bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN); 2722 } 2723 2724 /* 2725 * Add the buffer to the sorted clean or dirty block list. Return zero on 2726 * success, EEXIST if a buffer with this identity already exists, or another 2727 * error on allocation failure. 
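 *
 * Callers differ in how they treat a failure: buf_vlist_add() below panics
 * on any error, while bgetvp() tolerates EEXIST when it loses a race to
 * another thread attaching a buffer for the same block.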
2728 */ 2729 static inline int 2730 buf_vlist_find_or_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags) 2731 { 2732 struct bufv *bv; 2733 struct buf *n; 2734 int error; 2735 2736 ASSERT_BO_WLOCKED(bo); 2737 KASSERT((bo->bo_flag & BO_NOBUFS) == 0, 2738 ("buf_vlist_add: bo %p does not allow bufs", bo)); 2739 KASSERT((xflags & BX_VNDIRTY) == 0 || (bo->bo_flag & BO_DEAD) == 0, 2740 ("dead bo %p", bo)); 2741 KASSERT((bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) == xflags, 2742 ("buf_vlist_add: b_xflags %#x not set on bp %p", xflags, bp)); 2743 2744 if (xflags & BX_VNDIRTY) 2745 bv = &bo->bo_dirty; 2746 else 2747 bv = &bo->bo_clean; 2748 2749 error = buf_insert_lookup_le(bv, bp, &n); 2750 if (n == NULL) { 2751 KASSERT(error != EEXIST, 2752 ("buf_vlist_add: EEXIST but no existing buf found: bp %p", 2753 bp)); 2754 } else { 2755 KASSERT(n->b_lblkno <= bp->b_lblkno, 2756 ("buf_vlist_add: out of order insert/lookup: bp %p n %p", 2757 bp, n)); 2758 KASSERT((n->b_lblkno == bp->b_lblkno) == (error == EEXIST), 2759 ("buf_vlist_add: inconsistent result for existing buf: " 2760 "error %d bp %p n %p", error, bp, n)); 2761 } 2762 if (error != 0) 2763 return (error); 2764 2765 /* Keep the list ordered. */ 2766 if (n == NULL) { 2767 KASSERT(TAILQ_EMPTY(&bv->bv_hd) || 2768 bp->b_lblkno < TAILQ_FIRST(&bv->bv_hd)->b_lblkno, 2769 ("buf_vlist_add: queue order: " 2770 "%p should be before first %p", 2771 bp, TAILQ_FIRST(&bv->bv_hd))); 2772 TAILQ_INSERT_HEAD(&bv->bv_hd, bp, b_bobufs); 2773 } else { 2774 KASSERT(TAILQ_NEXT(n, b_bobufs) == NULL || 2775 bp->b_lblkno < TAILQ_NEXT(n, b_bobufs)->b_lblkno, 2776 ("buf_vlist_add: queue order: " 2777 "%p should be before next %p", 2778 bp, TAILQ_NEXT(n, b_bobufs))); 2779 TAILQ_INSERT_AFTER(&bv->bv_hd, n, bp, b_bobufs); 2780 } 2781 2782 bv->bv_cnt++; 2783 return (0); 2784 } 2785 2786 /* 2787 * Add the buffer to the sorted clean or dirty block list. 2788 * 2789 * NOTE: xflags is passed as a constant, optimizing this inline function! 2790 */ 2791 static void 2792 buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags) 2793 { 2794 int error; 2795 2796 KASSERT((bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) == 0, 2797 ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags)); 2798 bp->b_xflags |= xflags; 2799 error = buf_vlist_find_or_add(bp, bo, xflags); 2800 if (error) 2801 panic("buf_vlist_add: error=%d", error); 2802 } 2803 2804 /* 2805 * Look up a buffer using the buffer tries. 2806 */ 2807 struct buf * 2808 gbincore(struct bufobj *bo, daddr_t lblkno) 2809 { 2810 struct buf *bp; 2811 2812 ASSERT_BO_LOCKED(bo); 2813 bp = BUF_PCTRIE_LOOKUP(&bo->bo_clean.bv_root, lblkno); 2814 if (bp != NULL) 2815 return (bp); 2816 return (BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, lblkno)); 2817 } 2818 2819 /* 2820 * Look up a buf using the buffer tries, without the bufobj lock. This relies 2821 * on SMR for safe lookup, and bufs being in a no-free zone to provide type 2822 * stability of the result. Like other lockless lookups, the found buf may 2823 * already be invalid by the time this function returns. 2824 */ 2825 struct buf * 2826 gbincore_unlocked(struct bufobj *bo, daddr_t lblkno) 2827 { 2828 struct buf *bp; 2829 2830 ASSERT_BO_UNLOCKED(bo); 2831 bp = BUF_PCTRIE_LOOKUP_UNLOCKED(&bo->bo_clean.bv_root, lblkno); 2832 if (bp != NULL) 2833 return (bp); 2834 return (BUF_PCTRIE_LOOKUP_UNLOCKED(&bo->bo_dirty.bv_root, lblkno)); 2835 } 2836 2837 /* 2838 * Associate a buffer with a vnode. 
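 *
 * On success the vnode gains a hold reference, which brelvp() drops when
 * the buffer is disassociated.  The buffer is placed on the clean list;
 * reassignbuf() moves it to the dirty list once it is marked B_DELWRI.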
2839 */ 2840 int 2841 bgetvp(struct vnode *vp, struct buf *bp) 2842 { 2843 struct bufobj *bo; 2844 int error; 2845 2846 bo = &vp->v_bufobj; 2847 ASSERT_BO_UNLOCKED(bo); 2848 VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free")); 2849 2850 CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags); 2851 VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp, 2852 ("bgetvp: bp already attached! %p", bp)); 2853 2854 /* 2855 * Add the buf to the vnode's clean list unless we lost a race and find 2856 * an existing buf in either dirty or clean. 2857 */ 2858 bp->b_vp = vp; 2859 bp->b_bufobj = bo; 2860 bp->b_xflags |= BX_VNCLEAN; 2861 error = EEXIST; 2862 BO_LOCK(bo); 2863 if (BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, bp->b_lblkno) == NULL) 2864 error = buf_vlist_find_or_add(bp, bo, BX_VNCLEAN); 2865 BO_UNLOCK(bo); 2866 if (__predict_true(error == 0)) { 2867 vhold(vp); 2868 return (0); 2869 } 2870 if (error != EEXIST) 2871 panic("bgetvp: buf_vlist_add error: %d", error); 2872 bp->b_vp = NULL; 2873 bp->b_bufobj = NULL; 2874 bp->b_xflags &= ~BX_VNCLEAN; 2875 return (error); 2876 } 2877 2878 /* 2879 * Disassociate a buffer from a vnode. 2880 */ 2881 void 2882 brelvp(struct buf *bp) 2883 { 2884 struct bufobj *bo; 2885 struct vnode *vp; 2886 2887 CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); 2888 KASSERT(bp->b_vp != NULL, ("brelvp: NULL")); 2889 2890 /* 2891 * Delete from old vnode list, if on one. 2892 */ 2893 vp = bp->b_vp; /* XXX */ 2894 bo = bp->b_bufobj; 2895 BO_LOCK(bo); 2896 buf_vlist_remove(bp); 2897 if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { 2898 bo->bo_flag &= ~BO_ONWORKLST; 2899 mtx_lock(&sync_mtx); 2900 LIST_REMOVE(bo, bo_synclist); 2901 syncer_worklist_len--; 2902 mtx_unlock(&sync_mtx); 2903 } 2904 bp->b_vp = NULL; 2905 bp->b_bufobj = NULL; 2906 BO_UNLOCK(bo); 2907 vdrop(vp); 2908 } 2909 2910 /* 2911 * Add an item to the syncer work queue. 
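 *
 * The delay is expressed in seconds and is clamped to syncer_maxdelay - 2;
 * it selects a slot in the syncer_workitem_pending wheel relative to the
 * slot currently being drained.  reassignbuf() passes filedelay, dirdelay
 * or metadelay depending on the vnode type.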
2912 */ 2913 static void 2914 vn_syncer_add_to_worklist(struct bufobj *bo, int delay) 2915 { 2916 int slot; 2917 2918 ASSERT_BO_WLOCKED(bo); 2919 2920 mtx_lock(&sync_mtx); 2921 if (bo->bo_flag & BO_ONWORKLST) 2922 LIST_REMOVE(bo, bo_synclist); 2923 else { 2924 bo->bo_flag |= BO_ONWORKLST; 2925 syncer_worklist_len++; 2926 } 2927 2928 if (delay > syncer_maxdelay - 2) 2929 delay = syncer_maxdelay - 2; 2930 slot = (syncer_delayno + delay) & syncer_mask; 2931 2932 LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist); 2933 mtx_unlock(&sync_mtx); 2934 } 2935 2936 static int 2937 sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS) 2938 { 2939 int error, len; 2940 2941 mtx_lock(&sync_mtx); 2942 len = syncer_worklist_len - sync_vnode_count; 2943 mtx_unlock(&sync_mtx); 2944 error = SYSCTL_OUT(req, &len, sizeof(len)); 2945 return (error); 2946 } 2947 2948 SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, 2949 CTLTYPE_INT | CTLFLAG_MPSAFE| CTLFLAG_RD, NULL, 0, 2950 sysctl_vfs_worklist_len, "I", "Syncer thread worklist length"); 2951 2952 static struct proc *updateproc; 2953 static void sched_sync(void); 2954 static struct kproc_desc up_kp = { 2955 "syncer", 2956 sched_sync, 2957 &updateproc 2958 }; 2959 SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp); 2960 2961 static int 2962 sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td) 2963 { 2964 struct vnode *vp; 2965 struct mount *mp; 2966 2967 *bo = LIST_FIRST(slp); 2968 if (*bo == NULL) 2969 return (0); 2970 vp = bo2vnode(*bo); 2971 if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0) 2972 return (1); 2973 /* 2974 * We use vhold in case the vnode does not 2975 * successfully sync. vhold prevents the vnode from 2976 * going away when we unlock the sync_mtx so that 2977 * we can acquire the vnode interlock. 2978 */ 2979 vholdl(vp); 2980 mtx_unlock(&sync_mtx); 2981 VI_UNLOCK(vp); 2982 if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { 2983 vdrop(vp); 2984 mtx_lock(&sync_mtx); 2985 return (*bo == LIST_FIRST(slp)); 2986 } 2987 MPASSERT(mp == NULL || (curthread->td_pflags & TDP_IGNSUSP) != 0 || 2988 (mp->mnt_kern_flag & MNTK_SUSPENDED) == 0, mp, 2989 ("suspended mp syncing vp %p", vp)); 2990 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 2991 (void) VOP_FSYNC(vp, MNT_LAZY, td); 2992 VOP_UNLOCK(vp); 2993 vn_finished_write(mp); 2994 BO_LOCK(*bo); 2995 if (((*bo)->bo_flag & BO_ONWORKLST) != 0) { 2996 /* 2997 * Put us back on the worklist. The worklist 2998 * routine will remove us from our current 2999 * position and then add us back in at a later 3000 * position. 3001 */ 3002 vn_syncer_add_to_worklist(*bo, syncdelay); 3003 } 3004 BO_UNLOCK(*bo); 3005 vdrop(vp); 3006 mtx_lock(&sync_mtx); 3007 return (0); 3008 } 3009 3010 static int first_printf = 1; 3011 3012 /* 3013 * System filesystem synchronizer daemon. 
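 *
 * Roughly once a second the daemon advances to the next slot of the
 * syncer_workitem_pending wheel and fsyncs every bufobj found there;
 * bufobjs which still have dirty buffers afterwards are re-inserted at a
 * later slot.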
3014 */ 3015 static void 3016 sched_sync(void) 3017 { 3018 struct synclist *next, *slp; 3019 struct bufobj *bo; 3020 long starttime; 3021 struct thread *td = curthread; 3022 int last_work_seen; 3023 int net_worklist_len; 3024 int syncer_final_iter; 3025 int error; 3026 3027 last_work_seen = 0; 3028 syncer_final_iter = 0; 3029 syncer_state = SYNCER_RUNNING; 3030 starttime = time_uptime; 3031 td->td_pflags |= TDP_NORUNNINGBUF; 3032 3033 EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc, 3034 SHUTDOWN_PRI_LAST); 3035 3036 mtx_lock(&sync_mtx); 3037 for (;;) { 3038 if (syncer_state == SYNCER_FINAL_DELAY && 3039 syncer_final_iter == 0) { 3040 mtx_unlock(&sync_mtx); 3041 kproc_suspend_check(td->td_proc); 3042 mtx_lock(&sync_mtx); 3043 } 3044 net_worklist_len = syncer_worklist_len - sync_vnode_count; 3045 if (syncer_state != SYNCER_RUNNING && 3046 starttime != time_uptime) { 3047 if (first_printf) { 3048 printf("\nSyncing disks, vnodes remaining... "); 3049 first_printf = 0; 3050 } 3051 printf("%d ", net_worklist_len); 3052 } 3053 starttime = time_uptime; 3054 3055 /* 3056 * Push files whose dirty time has expired. Be careful 3057 * of interrupt race on slp queue. 3058 * 3059 * Skip over empty worklist slots when shutting down. 3060 */ 3061 do { 3062 slp = &syncer_workitem_pending[syncer_delayno]; 3063 syncer_delayno += 1; 3064 if (syncer_delayno == syncer_maxdelay) 3065 syncer_delayno = 0; 3066 next = &syncer_workitem_pending[syncer_delayno]; 3067 /* 3068 * If the worklist has wrapped since the 3069 * it was emptied of all but syncer vnodes, 3070 * switch to the FINAL_DELAY state and run 3071 * for one more second. 3072 */ 3073 if (syncer_state == SYNCER_SHUTTING_DOWN && 3074 net_worklist_len == 0 && 3075 last_work_seen == syncer_delayno) { 3076 syncer_state = SYNCER_FINAL_DELAY; 3077 syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP; 3078 } 3079 } while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) && 3080 syncer_worklist_len > 0); 3081 3082 /* 3083 * Keep track of the last time there was anything 3084 * on the worklist other than syncer vnodes. 3085 * Return to the SHUTTING_DOWN state if any 3086 * new work appears. 3087 */ 3088 if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING) 3089 last_work_seen = syncer_delayno; 3090 if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY) 3091 syncer_state = SYNCER_SHUTTING_DOWN; 3092 while (!LIST_EMPTY(slp)) { 3093 error = sync_vnode(slp, &bo, td); 3094 if (error == 1) { 3095 LIST_REMOVE(bo, bo_synclist); 3096 LIST_INSERT_HEAD(next, bo, bo_synclist); 3097 continue; 3098 } 3099 3100 if (first_printf == 0) { 3101 /* 3102 * Drop the sync mutex, because some watchdog 3103 * drivers need to sleep while patting 3104 */ 3105 mtx_unlock(&sync_mtx); 3106 wdog_kern_pat(WD_LASTVAL); 3107 mtx_lock(&sync_mtx); 3108 } 3109 } 3110 if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0) 3111 syncer_final_iter--; 3112 /* 3113 * The variable rushjob allows the kernel to speed up the 3114 * processing of the filesystem syncer process. A rushjob 3115 * value of N tells the filesystem syncer to process the next 3116 * N seconds worth of work on its queue ASAP. Currently rushjob 3117 * is used by the soft update code to speed up the filesystem 3118 * syncer process when the incore state is getting so far 3119 * ahead of the disk that the kernel memory pool is being 3120 * threatened with exhaustion. 
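 *
 * rushjob is bumped by speedup_syncer() below and is never allowed to
 * exceed syncdelay / 2, so a rush can at most halve the syncer's normal
 * pass time.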
3121 */ 3122 if (rushjob > 0) { 3123 rushjob -= 1; 3124 continue; 3125 } 3126 /* 3127 * Just sleep for a short period of time between 3128 * iterations when shutting down to allow some I/O 3129 * to happen. 3130 * 3131 * If it has taken us less than a second to process the 3132 * current work, then wait. Otherwise start right over 3133 * again. We can still lose time if any single round 3134 * takes more than two seconds, but it does not really 3135 * matter as we are just trying to generally pace the 3136 * filesystem activity. 3137 */ 3138 if (syncer_state != SYNCER_RUNNING || 3139 time_uptime == starttime) { 3140 thread_lock(td); 3141 sched_prio(td, PPAUSE); 3142 thread_unlock(td); 3143 } 3144 if (syncer_state != SYNCER_RUNNING) 3145 cv_timedwait(&sync_wakeup, &sync_mtx, 3146 hz / SYNCER_SHUTDOWN_SPEEDUP); 3147 else if (time_uptime == starttime) 3148 cv_timedwait(&sync_wakeup, &sync_mtx, hz); 3149 } 3150 } 3151 3152 /* 3153 * Request the syncer daemon to speed up its work. 3154 * We never push it to speed up more than half of its 3155 * normal turn time, otherwise it could take over the cpu. 3156 */ 3157 int 3158 speedup_syncer(void) 3159 { 3160 int ret = 0; 3161 3162 mtx_lock(&sync_mtx); 3163 if (rushjob < syncdelay / 2) { 3164 rushjob += 1; 3165 stat_rush_requests += 1; 3166 ret = 1; 3167 } 3168 mtx_unlock(&sync_mtx); 3169 cv_broadcast(&sync_wakeup); 3170 return (ret); 3171 } 3172 3173 /* 3174 * Tell the syncer to speed up its work and run though its work 3175 * list several times, then tell it to shut down. 3176 */ 3177 static void 3178 syncer_shutdown(void *arg, int howto) 3179 { 3180 3181 if (howto & RB_NOSYNC) 3182 return; 3183 mtx_lock(&sync_mtx); 3184 syncer_state = SYNCER_SHUTTING_DOWN; 3185 rushjob = 0; 3186 mtx_unlock(&sync_mtx); 3187 cv_broadcast(&sync_wakeup); 3188 kproc_shutdown(arg, howto); 3189 } 3190 3191 void 3192 syncer_suspend(void) 3193 { 3194 3195 syncer_shutdown(updateproc, 0); 3196 } 3197 3198 void 3199 syncer_resume(void) 3200 { 3201 3202 mtx_lock(&sync_mtx); 3203 first_printf = 1; 3204 syncer_state = SYNCER_RUNNING; 3205 mtx_unlock(&sync_mtx); 3206 cv_broadcast(&sync_wakeup); 3207 kproc_resume(updateproc); 3208 } 3209 3210 /* 3211 * Move the buffer between the clean and dirty lists of its vnode. 3212 */ 3213 void 3214 reassignbuf(struct buf *bp) 3215 { 3216 struct vnode *vp; 3217 struct bufobj *bo; 3218 int delay; 3219 #ifdef INVARIANTS 3220 struct bufv *bv; 3221 #endif 3222 3223 vp = bp->b_vp; 3224 bo = bp->b_bufobj; 3225 3226 KASSERT((bp->b_flags & B_PAGING) == 0, 3227 ("%s: cannot reassign paging buffer %p", __func__, bp)); 3228 3229 CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X", 3230 bp, bp->b_vp, bp->b_flags); 3231 3232 BO_LOCK(bo); 3233 if ((bo->bo_flag & BO_NONSTERILE) == 0) { 3234 /* 3235 * Coordinate with getblk's unlocked lookup. Make 3236 * BO_NONSTERILE visible before the first reassignbuf produces 3237 * any side effect. This could be outside the bo lock if we 3238 * used a separate atomic flag field. 3239 */ 3240 bo->bo_flag |= BO_NONSTERILE; 3241 atomic_thread_fence_rel(); 3242 } 3243 buf_vlist_remove(bp); 3244 3245 /* 3246 * If dirty, put on list of dirty buffers; otherwise insert onto list 3247 * of clean buffers. 
3248 */ 3249 if (bp->b_flags & B_DELWRI) { 3250 if ((bo->bo_flag & BO_ONWORKLST) == 0) { 3251 switch (vp->v_type) { 3252 case VDIR: 3253 delay = dirdelay; 3254 break; 3255 case VCHR: 3256 delay = metadelay; 3257 break; 3258 default: 3259 delay = filedelay; 3260 } 3261 vn_syncer_add_to_worklist(bo, delay); 3262 } 3263 buf_vlist_add(bp, bo, BX_VNDIRTY); 3264 } else { 3265 buf_vlist_add(bp, bo, BX_VNCLEAN); 3266 3267 if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { 3268 mtx_lock(&sync_mtx); 3269 LIST_REMOVE(bo, bo_synclist); 3270 syncer_worklist_len--; 3271 mtx_unlock(&sync_mtx); 3272 bo->bo_flag &= ~BO_ONWORKLST; 3273 } 3274 } 3275 #ifdef INVARIANTS 3276 bv = &bo->bo_clean; 3277 bp = TAILQ_FIRST(&bv->bv_hd); 3278 KASSERT(bp == NULL || bp->b_bufobj == bo, 3279 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 3280 bp = TAILQ_LAST(&bv->bv_hd, buflists); 3281 KASSERT(bp == NULL || bp->b_bufobj == bo, 3282 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 3283 bv = &bo->bo_dirty; 3284 bp = TAILQ_FIRST(&bv->bv_hd); 3285 KASSERT(bp == NULL || bp->b_bufobj == bo, 3286 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 3287 bp = TAILQ_LAST(&bv->bv_hd, buflists); 3288 KASSERT(bp == NULL || bp->b_bufobj == bo, 3289 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 3290 #endif 3291 BO_UNLOCK(bo); 3292 } 3293 3294 static void 3295 v_init_counters(struct vnode *vp) 3296 { 3297 3298 VNASSERT(vp->v_type == VNON && vp->v_data == NULL && vp->v_iflag == 0, 3299 vp, ("%s called for an initialized vnode", __FUNCTION__)); 3300 ASSERT_VI_UNLOCKED(vp, __FUNCTION__); 3301 3302 refcount_init(&vp->v_holdcnt, 1); 3303 refcount_init(&vp->v_usecount, 1); 3304 } 3305 3306 /* 3307 * Get a usecount on a vnode. 3308 * 3309 * vget and vget_finish may fail to lock the vnode if they lose a race against 3310 * it being doomed. LK_RETRY can be passed in flags to lock it anyway. 3311 * 3312 * Consumers which don't guarantee liveness of the vnode can use SMR to 3313 * try to get a reference. Note this operation can fail since the vnode 3314 * may be awaiting getting freed by the time they get to it. 3315 */ 3316 enum vgetstate 3317 vget_prep_smr(struct vnode *vp) 3318 { 3319 enum vgetstate vs; 3320 3321 VFS_SMR_ASSERT_ENTERED(); 3322 3323 if (refcount_acquire_if_not_zero(&vp->v_usecount)) { 3324 vs = VGET_USECOUNT; 3325 } else { 3326 if (vhold_smr(vp)) 3327 vs = VGET_HOLDCNT; 3328 else 3329 vs = VGET_NONE; 3330 } 3331 return (vs); 3332 } 3333 3334 enum vgetstate 3335 vget_prep(struct vnode *vp) 3336 { 3337 enum vgetstate vs; 3338 3339 if (refcount_acquire_if_not_zero(&vp->v_usecount)) { 3340 vs = VGET_USECOUNT; 3341 } else { 3342 vhold(vp); 3343 vs = VGET_HOLDCNT; 3344 } 3345 return (vs); 3346 } 3347 3348 void 3349 vget_abort(struct vnode *vp, enum vgetstate vs) 3350 { 3351 3352 switch (vs) { 3353 case VGET_USECOUNT: 3354 vrele(vp); 3355 goto out_ok; 3356 case VGET_HOLDCNT: 3357 vdrop(vp); 3358 goto out_ok; 3359 case VGET_NONE: 3360 break; 3361 } 3362 3363 __assert_unreachable(); 3364 3365 /* 3366 * This is a goto label should the cases above have more in common than 3367 * just the 'return' statement. 
3368 */ 3369 out_ok: 3370 return; 3371 } 3372 3373 int 3374 vget(struct vnode *vp, int flags) 3375 { 3376 enum vgetstate vs; 3377 3378 vs = vget_prep(vp); 3379 return (vget_finish(vp, flags, vs)); 3380 } 3381 3382 int 3383 vget_finish(struct vnode *vp, int flags, enum vgetstate vs) 3384 { 3385 int error; 3386 3387 if ((flags & LK_INTERLOCK) != 0) 3388 ASSERT_VI_LOCKED(vp, __func__); 3389 else 3390 ASSERT_VI_UNLOCKED(vp, __func__); 3391 VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp); 3392 VNPASS(vp->v_holdcnt > 0, vp); 3393 VNPASS(vs == VGET_HOLDCNT || vp->v_usecount > 0, vp); 3394 3395 error = vn_lock(vp, flags); 3396 if (__predict_false(error != 0)) { 3397 vget_abort(vp, vs); 3398 CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__, 3399 vp); 3400 return (error); 3401 } 3402 3403 vget_finish_ref(vp, vs); 3404 return (0); 3405 } 3406 3407 void 3408 vget_finish_ref(struct vnode *vp, enum vgetstate vs) 3409 { 3410 int old; 3411 3412 VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp); 3413 VNPASS(vp->v_holdcnt > 0, vp); 3414 VNPASS(vs == VGET_HOLDCNT || vp->v_usecount > 0, vp); 3415 3416 if (vs == VGET_USECOUNT) 3417 return; 3418 3419 /* 3420 * We hold the vnode. If the usecount is 0 it will be utilized to keep 3421 * the vnode around. Otherwise someone else lended their hold count and 3422 * we have to drop ours. 3423 */ 3424 old = atomic_fetchadd_int(&vp->v_usecount, 1); 3425 VNASSERT(old >= 0, vp, ("%s: wrong use count %d", __func__, old)); 3426 if (old != 0) { 3427 #ifdef INVARIANTS 3428 old = atomic_fetchadd_int(&vp->v_holdcnt, -1); 3429 VNASSERT(old > 1, vp, ("%s: wrong hold count %d", __func__, old)); 3430 #else 3431 refcount_release(&vp->v_holdcnt); 3432 #endif 3433 } 3434 } 3435 3436 void 3437 vref(struct vnode *vp) 3438 { 3439 enum vgetstate vs; 3440 3441 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3442 vs = vget_prep(vp); 3443 vget_finish_ref(vp, vs); 3444 } 3445 3446 void 3447 vrefact(struct vnode *vp) 3448 { 3449 int old __diagused; 3450 3451 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3452 old = refcount_acquire(&vp->v_usecount); 3453 VNASSERT(old > 0, vp, ("%s: wrong use count %d", __func__, old)); 3454 } 3455 3456 void 3457 vlazy(struct vnode *vp) 3458 { 3459 struct mount *mp; 3460 3461 VNASSERT(vp->v_holdcnt > 0, vp, ("%s: vnode not held", __func__)); 3462 3463 if ((vp->v_mflag & VMP_LAZYLIST) != 0) 3464 return; 3465 /* 3466 * We may get here for inactive routines after the vnode got doomed. 3467 */ 3468 if (VN_IS_DOOMED(vp)) 3469 return; 3470 mp = vp->v_mount; 3471 mtx_lock(&mp->mnt_listmtx); 3472 if ((vp->v_mflag & VMP_LAZYLIST) == 0) { 3473 vp->v_mflag |= VMP_LAZYLIST; 3474 TAILQ_INSERT_TAIL(&mp->mnt_lazyvnodelist, vp, v_lazylist); 3475 mp->mnt_lazyvnodelistsize++; 3476 } 3477 mtx_unlock(&mp->mnt_listmtx); 3478 } 3479 3480 static void 3481 vunlazy(struct vnode *vp) 3482 { 3483 struct mount *mp; 3484 3485 ASSERT_VI_LOCKED(vp, __func__); 3486 VNPASS(!VN_IS_DOOMED(vp), vp); 3487 3488 mp = vp->v_mount; 3489 mtx_lock(&mp->mnt_listmtx); 3490 VNPASS(vp->v_mflag & VMP_LAZYLIST, vp); 3491 /* 3492 * Don't remove the vnode from the lazy list if another thread 3493 * has increased the hold count. It may have re-enqueued the 3494 * vnode to the lazy list and is now responsible for its 3495 * removal. 
3496 */ 3497 if (vp->v_holdcnt == 0) { 3498 vp->v_mflag &= ~VMP_LAZYLIST; 3499 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, vp, v_lazylist); 3500 mp->mnt_lazyvnodelistsize--; 3501 } 3502 mtx_unlock(&mp->mnt_listmtx); 3503 } 3504 3505 /* 3506 * This routine is only meant to be called from vgonel prior to dooming 3507 * the vnode. 3508 */ 3509 static void 3510 vunlazy_gone(struct vnode *vp) 3511 { 3512 struct mount *mp; 3513 3514 ASSERT_VOP_ELOCKED(vp, __func__); 3515 ASSERT_VI_LOCKED(vp, __func__); 3516 VNPASS(!VN_IS_DOOMED(vp), vp); 3517 3518 if (vp->v_mflag & VMP_LAZYLIST) { 3519 mp = vp->v_mount; 3520 mtx_lock(&mp->mnt_listmtx); 3521 VNPASS(vp->v_mflag & VMP_LAZYLIST, vp); 3522 vp->v_mflag &= ~VMP_LAZYLIST; 3523 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, vp, v_lazylist); 3524 mp->mnt_lazyvnodelistsize--; 3525 mtx_unlock(&mp->mnt_listmtx); 3526 } 3527 } 3528 3529 static void 3530 vdefer_inactive(struct vnode *vp) 3531 { 3532 3533 ASSERT_VI_LOCKED(vp, __func__); 3534 VNPASS(vp->v_holdcnt > 0, vp); 3535 if (VN_IS_DOOMED(vp)) { 3536 vdropl(vp); 3537 return; 3538 } 3539 if (vp->v_iflag & VI_DEFINACT) { 3540 VNPASS(vp->v_holdcnt > 1, vp); 3541 vdropl(vp); 3542 return; 3543 } 3544 if (vp->v_usecount > 0) { 3545 vp->v_iflag &= ~VI_OWEINACT; 3546 vdropl(vp); 3547 return; 3548 } 3549 vlazy(vp); 3550 vp->v_iflag |= VI_DEFINACT; 3551 VI_UNLOCK(vp); 3552 atomic_add_long(&deferred_inact, 1); 3553 } 3554 3555 static void 3556 vdefer_inactive_unlocked(struct vnode *vp) 3557 { 3558 3559 VI_LOCK(vp); 3560 if ((vp->v_iflag & VI_OWEINACT) == 0) { 3561 vdropl(vp); 3562 return; 3563 } 3564 vdefer_inactive(vp); 3565 } 3566 3567 enum vput_op { VRELE, VPUT, VUNREF }; 3568 3569 /* 3570 * Handle ->v_usecount transitioning to 0. 3571 * 3572 * By releasing the last usecount we take ownership of the hold count which 3573 * provides liveness of the vnode, meaning we have to vdrop. 3574 * 3575 * For all vnodes we may need to perform inactive processing. It requires an 3576 * exclusive lock on the vnode, while it is legal to call here with only a 3577 * shared lock (or no locks). If locking the vnode in an expected manner fails, 3578 * inactive processing gets deferred to the syncer. 3579 */ 3580 static void 3581 vput_final(struct vnode *vp, enum vput_op func) 3582 { 3583 int error; 3584 bool want_unlock; 3585 3586 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3587 VNPASS(vp->v_holdcnt > 0, vp); 3588 3589 VI_LOCK(vp); 3590 3591 /* 3592 * By the time we got here someone else might have transitioned 3593 * the count back to > 0. 3594 */ 3595 if (vp->v_usecount > 0) 3596 goto out; 3597 3598 /* 3599 * If the vnode is doomed vgone already performed inactive processing 3600 * (if needed). 3601 */ 3602 if (VN_IS_DOOMED(vp)) 3603 goto out; 3604 3605 if (__predict_true(VOP_NEED_INACTIVE(vp) == 0)) 3606 goto out; 3607 3608 if (vp->v_iflag & VI_DOINGINACT) 3609 goto out; 3610 3611 /* 3612 * Locking operations here will drop the interlock and possibly the 3613 * vnode lock, opening a window where the vnode can get doomed all the 3614 * while ->v_usecount is 0. Set VI_OWEINACT to let vgone know to 3615 * perform inactive. 
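 *
 * If a suitable lock cannot be obtained below, the work is handed off to
 * vdefer_inactive() and eventually performed from the syncer.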
3616 */ 3617 vp->v_iflag |= VI_OWEINACT; 3618 want_unlock = false; 3619 error = 0; 3620 switch (func) { 3621 case VRELE: 3622 switch (VOP_ISLOCKED(vp)) { 3623 case LK_EXCLUSIVE: 3624 break; 3625 case LK_EXCLOTHER: 3626 case 0: 3627 want_unlock = true; 3628 error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK); 3629 VI_LOCK(vp); 3630 break; 3631 default: 3632 /* 3633 * The lock has at least one sharer, but we have no way 3634 * to conclude whether this is us. Play it safe and 3635 * defer processing. 3636 */ 3637 error = EAGAIN; 3638 break; 3639 } 3640 break; 3641 case VPUT: 3642 want_unlock = true; 3643 if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { 3644 error = VOP_LOCK(vp, LK_UPGRADE | LK_INTERLOCK | 3645 LK_NOWAIT); 3646 VI_LOCK(vp); 3647 } 3648 break; 3649 case VUNREF: 3650 if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { 3651 error = VOP_LOCK(vp, LK_TRYUPGRADE | LK_INTERLOCK); 3652 VI_LOCK(vp); 3653 } 3654 break; 3655 } 3656 if (error != 0) { 3657 vdefer_inactive(vp); 3658 return; 3659 } 3660 if (func == VUNREF) { 3661 VNASSERT((vp->v_vflag & VV_UNREF) == 0, vp, 3662 ("recursive vunref")); 3663 vp->v_vflag |= VV_UNREF; 3664 } 3665 for (;;) { 3666 error = vinactive(vp); 3667 if (want_unlock) 3668 VOP_UNLOCK(vp); 3669 if (error != ERELOOKUP || !want_unlock) 3670 break; 3671 VOP_LOCK(vp, LK_EXCLUSIVE); 3672 } 3673 if (func == VUNREF) 3674 vp->v_vflag &= ~VV_UNREF; 3675 vdropl(vp); 3676 return; 3677 out: 3678 if (func == VPUT) 3679 VOP_UNLOCK(vp); 3680 vdropl(vp); 3681 } 3682 3683 /* 3684 * Decrement ->v_usecount for a vnode. 3685 * 3686 * Releasing the last use count requires additional processing, see vput_final 3687 * above for details. 3688 * 3689 * Comment above each variant denotes lock state on entry and exit. 3690 */ 3691 3692 /* 3693 * in: any 3694 * out: same as passed in 3695 */ 3696 void 3697 vrele(struct vnode *vp) 3698 { 3699 3700 ASSERT_VI_UNLOCKED(vp, __func__); 3701 if (!refcount_release(&vp->v_usecount)) 3702 return; 3703 vput_final(vp, VRELE); 3704 } 3705 3706 /* 3707 * in: locked 3708 * out: unlocked 3709 */ 3710 void 3711 vput(struct vnode *vp) 3712 { 3713 3714 ASSERT_VOP_LOCKED(vp, __func__); 3715 ASSERT_VI_UNLOCKED(vp, __func__); 3716 if (!refcount_release(&vp->v_usecount)) { 3717 VOP_UNLOCK(vp); 3718 return; 3719 } 3720 vput_final(vp, VPUT); 3721 } 3722 3723 /* 3724 * in: locked 3725 * out: locked 3726 */ 3727 void 3728 vunref(struct vnode *vp) 3729 { 3730 3731 ASSERT_VOP_LOCKED(vp, __func__); 3732 ASSERT_VI_UNLOCKED(vp, __func__); 3733 if (!refcount_release(&vp->v_usecount)) 3734 return; 3735 vput_final(vp, VUNREF); 3736 } 3737 3738 void 3739 vhold(struct vnode *vp) 3740 { 3741 int old; 3742 3743 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3744 old = atomic_fetchadd_int(&vp->v_holdcnt, 1); 3745 VNASSERT(old >= 0 && (old & VHOLD_ALL_FLAGS) == 0, vp, 3746 ("%s: wrong hold count %d", __func__, old)); 3747 if (old == 0) 3748 vfs_freevnodes_dec(); 3749 } 3750 3751 void 3752 vholdnz(struct vnode *vp) 3753 { 3754 3755 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3756 #ifdef INVARIANTS 3757 int old = atomic_fetchadd_int(&vp->v_holdcnt, 1); 3758 VNASSERT(old > 0 && (old & VHOLD_ALL_FLAGS) == 0, vp, 3759 ("%s: wrong hold count %d", __func__, old)); 3760 #else 3761 atomic_add_int(&vp->v_holdcnt, 1); 3762 #endif 3763 } 3764 3765 /* 3766 * Grab a hold count unless the vnode is freed. 3767 * 3768 * Only use this routine if vfs smr is the only protection you have against 3769 * freeing the vnode. 3770 * 3771 * The code loops trying to add a hold count as long as the VHOLD_NO_SMR flag 3772 * is not set. 
After the flag is set the vnode becomes immutable to anyone but 3773 * the thread which managed to set the flag. 3774 * 3775 * It may be tempting to replace the loop with: 3776 * count = atomic_fetchadd_int(&vp->v_holdcnt, 1); 3777 * if (count & VHOLD_NO_SMR) { 3778 * backpedal and error out; 3779 * } 3780 * 3781 * However, while this is more performant, it hinders debugging by eliminating 3782 * the previously mentioned invariant. 3783 */ 3784 bool 3785 vhold_smr(struct vnode *vp) 3786 { 3787 int count; 3788 3789 VFS_SMR_ASSERT_ENTERED(); 3790 3791 count = atomic_load_int(&vp->v_holdcnt); 3792 for (;;) { 3793 if (count & VHOLD_NO_SMR) { 3794 VNASSERT((count & ~VHOLD_NO_SMR) == 0, vp, 3795 ("non-zero hold count with flags %d\n", count)); 3796 return (false); 3797 } 3798 VNASSERT(count >= 0, vp, ("invalid hold count %d\n", count)); 3799 if (atomic_fcmpset_int(&vp->v_holdcnt, &count, count + 1)) { 3800 if (count == 0) 3801 vfs_freevnodes_dec(); 3802 return (true); 3803 } 3804 } 3805 } 3806 3807 /* 3808 * Hold a free vnode for recycling. 3809 * 3810 * Note: vnode_init references this comment. 3811 * 3812 * Attempts to recycle only need the global vnode list lock and have no use for 3813 * SMR. 3814 * 3815 * However, vnodes get inserted into the global list before they get fully 3816 * initialized and stay there until UMA decides to free the memory. This in 3817 * particular means the target can be found before it becomes usable and after 3818 * it becomes recycled. Picking up such vnodes is guarded with v_holdcnt set to 3819 * VHOLD_NO_SMR. 3820 * 3821 * Note: the vnode may gain more references after we transition the count 0->1. 3822 */ 3823 static bool 3824 vhold_recycle_free(struct vnode *vp) 3825 { 3826 int count; 3827 3828 mtx_assert(&vnode_list_mtx, MA_OWNED); 3829 3830 count = atomic_load_int(&vp->v_holdcnt); 3831 for (;;) { 3832 if (count & VHOLD_NO_SMR) { 3833 VNASSERT((count & ~VHOLD_NO_SMR) == 0, vp, 3834 ("non-zero hold count with flags %d\n", count)); 3835 return (false); 3836 } 3837 VNASSERT(count >= 0, vp, ("invalid hold count %d\n", count)); 3838 if (count > 0) { 3839 return (false); 3840 } 3841 if (atomic_fcmpset_int(&vp->v_holdcnt, &count, count + 1)) { 3842 vfs_freevnodes_dec(); 3843 return (true); 3844 } 3845 } 3846 } 3847 3848 static void __noinline 3849 vdbatch_process(struct vdbatch *vd) 3850 { 3851 struct vnode *vp; 3852 int i; 3853 3854 mtx_assert(&vd->lock, MA_OWNED); 3855 MPASS(curthread->td_pinned > 0); 3856 MPASS(vd->index == VDBATCH_SIZE); 3857 3858 /* 3859 * Attempt to requeue the passed batch, but give up easily. 3860 * 3861 * Despite batching the mechanism is prone to transient *significant* 3862 * lock contention, where vnode_list_mtx becomes the primary bottleneck 3863 * if multiple CPUs get here (one real-world example is highly parallel 3864 * do-nothing make , which will stat *tons* of vnodes). Since it is 3865 * quasi-LRU (read: not that great even if fully honoured) provide an 3866 * option to just dodge the problem. Parties which don't like it are 3867 * welcome to implement something better. 
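 *
 * When requeueing is skipped the batched vnodes simply keep their current
 * position on the global LRU list and only the batch bookkeeping is
 * cleared, trading some LRU accuracy for not blocking on a contended
 * vnode_list_mtx.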
3868 */
3869 if (vnode_can_skip_requeue) {
3870 if (!mtx_trylock(&vnode_list_mtx)) {
3871 counter_u64_add(vnode_skipped_requeues, 1);
3872 critical_enter();
3873 for (i = 0; i < VDBATCH_SIZE; i++) {
3874 vp = vd->tab[i];
3875 vd->tab[i] = NULL;
3876 MPASS(vp->v_dbatchcpu != NOCPU);
3877 vp->v_dbatchcpu = NOCPU;
3878 }
3879 vd->index = 0;
3880 critical_exit();
3881 return;
3882
3883 }
3884 /* fallthrough to locked processing */
3885 } else {
3886 mtx_lock(&vnode_list_mtx);
3887 }
3888
3889 mtx_assert(&vnode_list_mtx, MA_OWNED);
3890 critical_enter();
3891 for (i = 0; i < VDBATCH_SIZE; i++) {
3892 vp = vd->tab[i];
3893 vd->tab[i] = NULL;
3894 TAILQ_REMOVE(&vnode_list, vp, v_vnodelist);
3895 TAILQ_INSERT_TAIL(&vnode_list, vp, v_vnodelist);
3896 MPASS(vp->v_dbatchcpu != NOCPU);
3897 vp->v_dbatchcpu = NOCPU;
3898 }
3899 mtx_unlock(&vnode_list_mtx);
3900 vd->index = 0;
3901 critical_exit();
3902 }
3903
3904 static void
3905 vdbatch_enqueue(struct vnode *vp)
3906 {
3907 struct vdbatch *vd;
3908
3909 ASSERT_VI_LOCKED(vp, __func__);
3910 VNPASS(!VN_IS_DOOMED(vp), vp);
3911
3912 if (vp->v_dbatchcpu != NOCPU) {
3913 VI_UNLOCK(vp);
3914 return;
3915 }
3916
3917 sched_pin();
3918 vd = DPCPU_PTR(vd);
3919 mtx_lock(&vd->lock);
3920 MPASS(vd->index < VDBATCH_SIZE);
3921 MPASS(vd->tab[vd->index] == NULL);
3922 /*
3923 * A hack: we depend on being pinned so that we know what to put in
3924 * ->v_dbatchcpu.
3925 */
3926 vp->v_dbatchcpu = curcpu;
3927 vd->tab[vd->index] = vp;
3928 vd->index++;
3929 VI_UNLOCK(vp);
3930 if (vd->index == VDBATCH_SIZE)
3931 vdbatch_process(vd);
3932 mtx_unlock(&vd->lock);
3933 sched_unpin();
3934 }
3935
3936 /*
3937 * This routine must only be called for vnodes which are about to be
3938 * deallocated. Supporting dequeue for arbitrary vnodes would require
3939 * validating that the locked batch matches.
3940 */
3941 static void
3942 vdbatch_dequeue(struct vnode *vp)
3943 {
3944 struct vdbatch *vd;
3945 int i;
3946 short cpu;
3947
3948 VNPASS(vp->v_type == VBAD || vp->v_type == VNON, vp);
3949
3950 cpu = vp->v_dbatchcpu;
3951 if (cpu == NOCPU)
3952 return;
3953
3954 vd = DPCPU_ID_PTR(cpu, vd);
3955 mtx_lock(&vd->lock);
3956 for (i = 0; i < vd->index; i++) {
3957 if (vd->tab[i] != vp)
3958 continue;
3959 vp->v_dbatchcpu = NOCPU;
3960 vd->index--;
3961 vd->tab[i] = vd->tab[vd->index];
3962 vd->tab[vd->index] = NULL;
3963 break;
3964 }
3965 mtx_unlock(&vd->lock);
3966 /*
3967 * Either we dequeued the vnode above or the target CPU beat us to it.
3968 */
3969 MPASS(vp->v_dbatchcpu == NOCPU);
3970 }
3971
3972 /*
3973 * Drop the hold count of the vnode.
3974 *
3975 * It will only get freed if this is the last hold *and* it has been vgone'd.
3976 *
3977 * Because the vnode vm object keeps a hold reference on the vnode if
3978 * there is at least one resident non-cached page, the vnode cannot
3979 * leave the active list without the page cleanup done.
3980 */
3981 static void __noinline
3982 vdropl_final(struct vnode *vp)
3983 {
3984
3985 ASSERT_VI_LOCKED(vp, __func__);
3986 VNPASS(VN_IS_DOOMED(vp), vp);
3987 /*
3988 * Set the VHOLD_NO_SMR flag.
3989 *
3990 * We may be racing against vhold_smr. If they win, we can just pretend
3991 * we never got this far; they will vdrop later.
3992 */
3993 if (__predict_false(!atomic_cmpset_int(&vp->v_holdcnt, 0, VHOLD_NO_SMR))) {
3994 vfs_freevnodes_inc();
3995 VI_UNLOCK(vp);
3996 /*
3997 * We lost the aforementioned race. Any subsequent access is
3998 * invalid as they might have managed to vdropl on their own.
3999 */
4000 return;
4001 }
4002 /*
4003 * Don't bump freevnodes as this one is going away.
4004 */
4005 freevnode(vp);
4006 }
4007
4008 void
4009 vdrop(struct vnode *vp)
4010 {
4011
4012 ASSERT_VI_UNLOCKED(vp, __func__);
4013 CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
4014 if (refcount_release_if_not_last(&vp->v_holdcnt))
4015 return;
4016 VI_LOCK(vp);
4017 vdropl(vp);
4018 }
4019
4020 static __always_inline void
4021 vdropl_impl(struct vnode *vp, bool enqueue)
4022 {
4023
4024 ASSERT_VI_LOCKED(vp, __func__);
4025 CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
4026 if (!refcount_release(&vp->v_holdcnt)) {
4027 VI_UNLOCK(vp);
4028 return;
4029 }
4030 VNPASS((vp->v_iflag & VI_OWEINACT) == 0, vp);
4031 VNPASS((vp->v_iflag & VI_DEFINACT) == 0, vp);
4032 if (VN_IS_DOOMED(vp)) {
4033 vdropl_final(vp);
4034 return;
4035 }
4036
4037 vfs_freevnodes_inc();
4038 if (vp->v_mflag & VMP_LAZYLIST) {
4039 vunlazy(vp);
4040 }
4041
4042 if (!enqueue) {
4043 VI_UNLOCK(vp);
4044 return;
4045 }
4046
4047 /*
4048 * Also unlocks the interlock. We can't assert on it as we
4049 * released our hold and by now the vnode might have been
4050 * freed.
4051 */
4052 vdbatch_enqueue(vp);
4053 }
4054
4055 void
4056 vdropl(struct vnode *vp)
4057 {
4058
4059 vdropl_impl(vp, true);
4060 }
4061
4062 /*
4063 * vdrop a vnode when recycling
4064 *
4065 * This is a special case routine only to be used when recycling; it differs
4066 * from the regular vdrop by not requeueing the vnode on the LRU.
4067 *
4068 * Consider a case where vtryrecycle continuously fails with all vnodes (due to
4069 * e.g., frozen writes on the filesystem), filling the batch and causing it to
4070 * be requeued. Then vnlru will end up revisiting the same vnodes. This is a
4071 * loop which can last for as long as writes are frozen.
4072 */
4073 static void
4074 vdropl_recycle(struct vnode *vp)
4075 {
4076
4077 vdropl_impl(vp, false);
4078 }
4079
4080 static void
4081 vdrop_recycle(struct vnode *vp)
4082 {
4083
4084 VI_LOCK(vp);
4085 vdropl_recycle(vp);
4086 }
4087
4088 /*
4089 * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT
4090 * flags. DOINGINACT prevents us from recursing in calls to vinactive.
4091 */
4092 static int
4093 vinactivef(struct vnode *vp)
4094 {
4095 int error;
4096
4097 ASSERT_VOP_ELOCKED(vp, "vinactive");
4098 ASSERT_VI_LOCKED(vp, "vinactive");
4099 VNPASS((vp->v_iflag & VI_DOINGINACT) == 0, vp);
4100 CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
4101 vp->v_iflag |= VI_DOINGINACT;
4102 vp->v_iflag &= ~VI_OWEINACT;
4103 VI_UNLOCK(vp);
4104
4105 /*
4106 * Before moving off the active list, we must be sure that any
4107 * modified pages are converted into the vnode's dirty
4108 * buffers, since these will no longer be checked once the
4109 * vnode is on the inactive list.
4110 *
4111 * The write-out of the dirty pages is asynchronous. At the
4112 * point that VOP_INACTIVE() is called, there could still be
4113 * pending I/O and dirty pages in the object.
4114 */
4115 if ((vp->v_vflag & VV_NOSYNC) == 0)
4116 vnode_pager_clean_async(vp);
4117
4118 error = VOP_INACTIVE(vp);
4119 VI_LOCK(vp);
4120 VNPASS(vp->v_iflag & VI_DOINGINACT, vp);
4121 vp->v_iflag &= ~VI_DOINGINACT;
4122 return (error);
4123 }
4124
4125 int
4126 vinactive(struct vnode *vp)
4127 {
4128
4129 ASSERT_VOP_ELOCKED(vp, "vinactive");
4130 ASSERT_VI_LOCKED(vp, "vinactive");
4131 CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
4132
4133 if ((vp->v_iflag & VI_OWEINACT) == 0)
4134 return (0);
4135 if (vp->v_iflag & VI_DOINGINACT)
4136 return (0);
4137 if (vp->v_usecount > 0) {
4138 vp->v_iflag &= ~VI_OWEINACT;
4139 return (0);
4140 }
4141 return (vinactivef(vp));
4142 }
4143
4144 /*
4145 * Remove any vnodes in the vnode table belonging to mount point mp.
4146 *
4147 * If FORCECLOSE is not specified, there should not be any active ones;
4148 * return an error if any are found (nb: this is a user error, not a
4149 * system error). If FORCECLOSE is specified, detach any active vnodes
4150 * that are found.
4151 *
4152 * If WRITECLOSE is set, only flush out regular file vnodes open for
4153 * writing.
4154 *
4155 * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
4156 *
4157 * `rootrefs' specifies the base reference count for the root vnode
4158 * of this filesystem. The root vnode is considered busy if its
4159 * v_usecount exceeds this value. On a successful return, vflush()
4160 * will call vrele() on the root vnode exactly rootrefs times.
4161 * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
4162 * be zero.
4163 */
4164 #ifdef DIAGNOSTIC
4165 static int busyprt = 0; /* print out busy vnodes */
4166 SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "Print out busy vnodes");
4167 #endif
4168
4169 int
4170 vflush(struct mount *mp, int rootrefs, int flags, struct thread *td)
4171 {
4172 struct vnode *vp, *mvp, *rootvp = NULL;
4173 struct vattr vattr;
4174 int busy = 0, error;
4175
4176 CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp,
4177 rootrefs, flags);
4178 if (rootrefs > 0) {
4179 KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0,
4180 ("vflush: bad args"));
4181 /*
4182 * Get the filesystem root vnode. We can vput() it
4183 * immediately, since with rootrefs > 0, it won't go away.
4184 */
4185 if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) {
4186 CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d",
4187 __func__, error);
4188 return (error);
4189 }
4190 vput(rootvp);
4191 }
4192 loop:
4193 MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
4194 vholdl(vp);
4195 error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE);
4196 if (error) {
4197 vdrop(vp);
4198 MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
4199 goto loop;
4200 }
4201 /*
4202 * Skip over vnodes marked VV_SYSTEM.
4203 */
4204 if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
4205 VOP_UNLOCK(vp);
4206 vdrop(vp);
4207 continue;
4208 }
4209 /*
4210 * If WRITECLOSE is set, flush out unlinked but still open
4211 * files (even if open only for reading) and regular file
4212 * vnodes open for writing.
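 * (Below, the asynchronous pager clean plus the VOP_FSYNC() loop push dirty
 * pages and buffers out first; an ERELOOKUP return from VOP_FSYNC() only
 * means the fsync has to be retried.)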
4213 */ 4214 if (flags & WRITECLOSE) { 4215 vnode_pager_clean_async(vp); 4216 do { 4217 error = VOP_FSYNC(vp, MNT_WAIT, td); 4218 } while (error == ERELOOKUP); 4219 if (error != 0) { 4220 VOP_UNLOCK(vp); 4221 vdrop(vp); 4222 MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); 4223 return (error); 4224 } 4225 error = VOP_GETATTR(vp, &vattr, td->td_ucred); 4226 VI_LOCK(vp); 4227 4228 if ((vp->v_type == VNON || 4229 (error == 0 && vattr.va_nlink > 0)) && 4230 (vp->v_writecount <= 0 || vp->v_type != VREG)) { 4231 VOP_UNLOCK(vp); 4232 vdropl(vp); 4233 continue; 4234 } 4235 } else 4236 VI_LOCK(vp); 4237 /* 4238 * With v_usecount == 0, all we need to do is clear out the 4239 * vnode data structures and we are done. 4240 * 4241 * If FORCECLOSE is set, forcibly close the vnode. 4242 */ 4243 if (vp->v_usecount == 0 || (flags & FORCECLOSE)) { 4244 vgonel(vp); 4245 } else { 4246 busy++; 4247 #ifdef DIAGNOSTIC 4248 if (busyprt) 4249 vn_printf(vp, "vflush: busy vnode "); 4250 #endif 4251 } 4252 VOP_UNLOCK(vp); 4253 vdropl(vp); 4254 } 4255 if (rootrefs > 0 && (flags & FORCECLOSE) == 0) { 4256 /* 4257 * If just the root vnode is busy, and if its refcount 4258 * is equal to `rootrefs', then go ahead and kill it. 4259 */ 4260 VI_LOCK(rootvp); 4261 KASSERT(busy > 0, ("vflush: not busy")); 4262 VNASSERT(rootvp->v_usecount >= rootrefs, rootvp, 4263 ("vflush: usecount %d < rootrefs %d", 4264 rootvp->v_usecount, rootrefs)); 4265 if (busy == 1 && rootvp->v_usecount == rootrefs) { 4266 VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK); 4267 vgone(rootvp); 4268 VOP_UNLOCK(rootvp); 4269 busy = 0; 4270 } else 4271 VI_UNLOCK(rootvp); 4272 } 4273 if (busy) { 4274 CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__, 4275 busy); 4276 return (EBUSY); 4277 } 4278 for (; rootrefs > 0; rootrefs--) 4279 vrele(rootvp); 4280 return (0); 4281 } 4282 4283 /* 4284 * Recycle an unused vnode. 4285 */ 4286 int 4287 vrecycle(struct vnode *vp) 4288 { 4289 int recycled; 4290 4291 VI_LOCK(vp); 4292 recycled = vrecyclel(vp); 4293 VI_UNLOCK(vp); 4294 return (recycled); 4295 } 4296 4297 /* 4298 * vrecycle, with the vp interlock held. 4299 */ 4300 int 4301 vrecyclel(struct vnode *vp) 4302 { 4303 int recycled; 4304 4305 ASSERT_VOP_ELOCKED(vp, __func__); 4306 ASSERT_VI_LOCKED(vp, __func__); 4307 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 4308 recycled = 0; 4309 if (vp->v_usecount == 0) { 4310 recycled = 1; 4311 vgonel(vp); 4312 } 4313 return (recycled); 4314 } 4315 4316 /* 4317 * Eliminate all activity associated with a vnode 4318 * in preparation for reuse. 4319 */ 4320 void 4321 vgone(struct vnode *vp) 4322 { 4323 VI_LOCK(vp); 4324 vgonel(vp); 4325 VI_UNLOCK(vp); 4326 } 4327 4328 /* 4329 * Notify upper mounts about reclaimed or unlinked vnode. 
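 * (For instance, a nullfs mount stacked above this vnode registers itself
 * on the lower mount's mnt_notify list and has its VFS_RECLAIM_LOWERVP or
 * VFS_UNLINK_LOWERVP hook invoked here so it can purge its own upper vnode.)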
4330 */ 4331 void 4332 vfs_notify_upper(struct vnode *vp, enum vfs_notify_upper_type event) 4333 { 4334 struct mount *mp; 4335 struct mount_upper_node *ump; 4336 4337 mp = atomic_load_ptr(&vp->v_mount); 4338 if (mp == NULL) 4339 return; 4340 if (TAILQ_EMPTY(&mp->mnt_notify)) 4341 return; 4342 4343 MNT_ILOCK(mp); 4344 mp->mnt_upper_pending++; 4345 KASSERT(mp->mnt_upper_pending > 0, 4346 ("%s: mnt_upper_pending %d", __func__, mp->mnt_upper_pending)); 4347 TAILQ_FOREACH(ump, &mp->mnt_notify, mnt_upper_link) { 4348 MNT_IUNLOCK(mp); 4349 switch (event) { 4350 case VFS_NOTIFY_UPPER_RECLAIM: 4351 VFS_RECLAIM_LOWERVP(ump->mp, vp); 4352 break; 4353 case VFS_NOTIFY_UPPER_UNLINK: 4354 VFS_UNLINK_LOWERVP(ump->mp, vp); 4355 break; 4356 } 4357 MNT_ILOCK(mp); 4358 } 4359 mp->mnt_upper_pending--; 4360 if ((mp->mnt_kern_flag & MNTK_UPPER_WAITER) != 0 && 4361 mp->mnt_upper_pending == 0) { 4362 mp->mnt_kern_flag &= ~MNTK_UPPER_WAITER; 4363 wakeup(&mp->mnt_uppers); 4364 } 4365 MNT_IUNLOCK(mp); 4366 } 4367 4368 /* 4369 * vgone, with the vp interlock held. 4370 */ 4371 static void 4372 vgonel(struct vnode *vp) 4373 { 4374 struct thread *td; 4375 struct mount *mp; 4376 vm_object_t object; 4377 bool active, doinginact, oweinact; 4378 4379 ASSERT_VOP_ELOCKED(vp, "vgonel"); 4380 ASSERT_VI_LOCKED(vp, "vgonel"); 4381 VNASSERT(vp->v_holdcnt, vp, 4382 ("vgonel: vp %p has no reference.", vp)); 4383 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 4384 td = curthread; 4385 4386 /* 4387 * Don't vgonel if we're already doomed. 4388 */ 4389 if (VN_IS_DOOMED(vp)) { 4390 VNPASS(vn_get_state(vp) == VSTATE_DESTROYING || \ 4391 vn_get_state(vp) == VSTATE_DEAD, vp); 4392 return; 4393 } 4394 /* 4395 * Paired with freevnode. 4396 */ 4397 vn_seqc_write_begin_locked(vp); 4398 vunlazy_gone(vp); 4399 vn_irflag_set_locked(vp, VIRF_DOOMED); 4400 vn_set_state(vp, VSTATE_DESTROYING); 4401 4402 /* 4403 * Check to see if the vnode is in use. If so, we have to 4404 * call VOP_CLOSE() and VOP_INACTIVE(). 4405 * 4406 * It could be that VOP_INACTIVE() requested reclamation, in 4407 * which case we should avoid recursion, so check 4408 * VI_DOINGINACT. This is not precise but good enough. 4409 */ 4410 active = vp->v_usecount > 0; 4411 oweinact = (vp->v_iflag & VI_OWEINACT) != 0; 4412 doinginact = (vp->v_iflag & VI_DOINGINACT) != 0; 4413 4414 /* 4415 * If we need to do inactive VI_OWEINACT will be set. 4416 */ 4417 if (vp->v_iflag & VI_DEFINACT) { 4418 VNASSERT(vp->v_holdcnt > 1, vp, ("lost hold count")); 4419 vp->v_iflag &= ~VI_DEFINACT; 4420 vdropl(vp); 4421 } else { 4422 VNASSERT(vp->v_holdcnt > 0, vp, ("vnode without hold count")); 4423 VI_UNLOCK(vp); 4424 } 4425 cache_purge_vgone(vp); 4426 vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM); 4427 4428 /* 4429 * If purging an active vnode, it must be closed and 4430 * deactivated before being reclaimed. 4431 */ 4432 if (active) 4433 VOP_CLOSE(vp, FNONBLOCK, NOCRED, td); 4434 if (!doinginact) { 4435 do { 4436 if (oweinact || active) { 4437 VI_LOCK(vp); 4438 vinactivef(vp); 4439 oweinact = (vp->v_iflag & VI_OWEINACT) != 0; 4440 VI_UNLOCK(vp); 4441 } 4442 } while (oweinact); 4443 } 4444 if (vp->v_type == VSOCK) 4445 vfs_unp_reclaim(vp); 4446 4447 /* 4448 * Clean out any buffers associated with the vnode. 4449 * If the flush fails, just toss the buffers. 
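 * (The first vinvalbuf() call below uses V_SAVE to try to write dirty
 * buffers out; if that fails, the loop retries with no flags and discards
 * them.)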
4450 */ 4451 mp = NULL; 4452 if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd)) 4453 (void) vn_start_secondary_write(vp, &mp, V_WAIT); 4454 if (vinvalbuf(vp, V_SAVE, 0, 0) != 0) { 4455 while (vinvalbuf(vp, 0, 0, 0) != 0) 4456 ; 4457 } 4458 4459 BO_LOCK(&vp->v_bufobj); 4460 KASSERT(TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd) && 4461 vp->v_bufobj.bo_dirty.bv_cnt == 0 && 4462 TAILQ_EMPTY(&vp->v_bufobj.bo_clean.bv_hd) && 4463 vp->v_bufobj.bo_clean.bv_cnt == 0, 4464 ("vp %p bufobj not invalidated", vp)); 4465 4466 /* 4467 * For VMIO bufobj, BO_DEAD is set later, or in 4468 * vm_object_terminate() after the object's page queue is 4469 * flushed. 4470 */ 4471 object = vp->v_bufobj.bo_object; 4472 if (object == NULL) 4473 vp->v_bufobj.bo_flag |= BO_DEAD; 4474 BO_UNLOCK(&vp->v_bufobj); 4475 4476 /* 4477 * Handle the VM part. Tmpfs handles v_object on its own (the 4478 * OBJT_VNODE check). Nullfs or other bypassing filesystems 4479 * should not touch the object borrowed from the lower vnode 4480 * (the handle check). 4481 */ 4482 if (object != NULL && object->type == OBJT_VNODE && 4483 object->handle == vp) 4484 vnode_destroy_vobject(vp); 4485 4486 /* 4487 * Reclaim the vnode. 4488 */ 4489 if (VOP_RECLAIM(vp)) 4490 panic("vgone: cannot reclaim"); 4491 if (mp != NULL) 4492 vn_finished_secondary_write(mp); 4493 VNASSERT(vp->v_object == NULL, vp, 4494 ("vop_reclaim left v_object vp=%p", vp)); 4495 /* 4496 * Clear the advisory locks and wake up waiting threads. 4497 */ 4498 if (vp->v_lockf != NULL) { 4499 (void)VOP_ADVLOCKPURGE(vp); 4500 vp->v_lockf = NULL; 4501 } 4502 /* 4503 * Delete from old mount point vnode list. 4504 */ 4505 if (vp->v_mount == NULL) { 4506 VI_LOCK(vp); 4507 } else { 4508 delmntque(vp); 4509 ASSERT_VI_LOCKED(vp, "vgonel 2"); 4510 } 4511 /* 4512 * Done with purge, reset to the standard lock and invalidate 4513 * the vnode. 4514 * 4515 * FIXME: this is buggy for vnode ops with custom locking primitives. 4516 * 4517 * vget used to be gated with a special flag serializing it against vgone, 4518 * which got lost in the process of SMP-ifying the VFS layer. 4519 * 4520 * Suppose a custom locking routine references ->v_data. 4521 * 4522 * Since now it is possible to start executing it as vgone is 4523 * progressing, this very well may crash as ->v_data gets invalidated 4524 * and memory used to back it is freed. 4525 */ 4526 vp->v_vnlock = &vp->v_lock; 4527 vp->v_op = &dead_vnodeops; 4528 vp->v_type = VBAD; 4529 vn_set_state(vp, VSTATE_DEAD); 4530 } 4531 4532 /* 4533 * Print out a description of a vnode. 4534 */ 4535 static const char *const vtypename[] = { 4536 [VNON] = "VNON", 4537 [VREG] = "VREG", 4538 [VDIR] = "VDIR", 4539 [VBLK] = "VBLK", 4540 [VCHR] = "VCHR", 4541 [VLNK] = "VLNK", 4542 [VSOCK] = "VSOCK", 4543 [VFIFO] = "VFIFO", 4544 [VBAD] = "VBAD", 4545 [VMARKER] = "VMARKER", 4546 }; 4547 _Static_assert(nitems(vtypename) == VLASTTYPE + 1, 4548 "vnode type name not added to vtypename"); 4549 4550 static const char *const vstatename[] = { 4551 [VSTATE_UNINITIALIZED] = "VSTATE_UNINITIALIZED", 4552 [VSTATE_CONSTRUCTED] = "VSTATE_CONSTRUCTED", 4553 [VSTATE_DESTROYING] = "VSTATE_DESTROYING", 4554 [VSTATE_DEAD] = "VSTATE_DEAD", 4555 }; 4556 _Static_assert(nitems(vstatename) == VLASTSTATE + 1, 4557 "vnode state name not added to vstatename"); 4558 4559 _Static_assert((VHOLD_ALL_FLAGS & ~VHOLD_NO_SMR) == 0, 4560 "new hold count flag not added to vn_printf"); 4561 4562 void 4563 vn_printf(struct vnode *vp, const char *fmt, ...) 
4564 { 4565 va_list ap; 4566 char buf[256], buf2[16]; 4567 u_long flags; 4568 u_int holdcnt; 4569 short irflag; 4570 4571 va_start(ap, fmt); 4572 vprintf(fmt, ap); 4573 va_end(ap); 4574 printf("%p: ", (void *)vp); 4575 printf("type %s state %s op %p\n", vtypename[vp->v_type], 4576 vstatename[vp->v_state], vp->v_op); 4577 holdcnt = atomic_load_int(&vp->v_holdcnt); 4578 printf(" usecount %d, writecount %d, refcount %d seqc users %d", 4579 vp->v_usecount, vp->v_writecount, holdcnt & ~VHOLD_ALL_FLAGS, 4580 vp->v_seqc_users); 4581 switch (vp->v_type) { 4582 case VDIR: 4583 printf(" mountedhere %p\n", vp->v_mountedhere); 4584 break; 4585 case VCHR: 4586 printf(" rdev %p\n", vp->v_rdev); 4587 break; 4588 case VSOCK: 4589 printf(" socket %p\n", vp->v_unpcb); 4590 break; 4591 case VFIFO: 4592 printf(" fifoinfo %p\n", vp->v_fifoinfo); 4593 break; 4594 default: 4595 printf("\n"); 4596 break; 4597 } 4598 buf[0] = '\0'; 4599 buf[1] = '\0'; 4600 if (holdcnt & VHOLD_NO_SMR) 4601 strlcat(buf, "|VHOLD_NO_SMR", sizeof(buf)); 4602 printf(" hold count flags (%s)\n", buf + 1); 4603 4604 buf[0] = '\0'; 4605 buf[1] = '\0'; 4606 irflag = vn_irflag_read(vp); 4607 if (irflag & VIRF_DOOMED) 4608 strlcat(buf, "|VIRF_DOOMED", sizeof(buf)); 4609 if (irflag & VIRF_PGREAD) 4610 strlcat(buf, "|VIRF_PGREAD", sizeof(buf)); 4611 if (irflag & VIRF_MOUNTPOINT) 4612 strlcat(buf, "|VIRF_MOUNTPOINT", sizeof(buf)); 4613 if (irflag & VIRF_TEXT_REF) 4614 strlcat(buf, "|VIRF_TEXT_REF", sizeof(buf)); 4615 flags = irflag & ~(VIRF_DOOMED | VIRF_PGREAD | VIRF_MOUNTPOINT | VIRF_TEXT_REF); 4616 if (flags != 0) { 4617 snprintf(buf2, sizeof(buf2), "|VIRF(0x%lx)", flags); 4618 strlcat(buf, buf2, sizeof(buf)); 4619 } 4620 if (vp->v_vflag & VV_ROOT) 4621 strlcat(buf, "|VV_ROOT", sizeof(buf)); 4622 if (vp->v_vflag & VV_ISTTY) 4623 strlcat(buf, "|VV_ISTTY", sizeof(buf)); 4624 if (vp->v_vflag & VV_NOSYNC) 4625 strlcat(buf, "|VV_NOSYNC", sizeof(buf)); 4626 if (vp->v_vflag & VV_ETERNALDEV) 4627 strlcat(buf, "|VV_ETERNALDEV", sizeof(buf)); 4628 if (vp->v_vflag & VV_CACHEDLABEL) 4629 strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf)); 4630 if (vp->v_vflag & VV_VMSIZEVNLOCK) 4631 strlcat(buf, "|VV_VMSIZEVNLOCK", sizeof(buf)); 4632 if (vp->v_vflag & VV_COPYONWRITE) 4633 strlcat(buf, "|VV_COPYONWRITE", sizeof(buf)); 4634 if (vp->v_vflag & VV_SYSTEM) 4635 strlcat(buf, "|VV_SYSTEM", sizeof(buf)); 4636 if (vp->v_vflag & VV_PROCDEP) 4637 strlcat(buf, "|VV_PROCDEP", sizeof(buf)); 4638 if (vp->v_vflag & VV_DELETED) 4639 strlcat(buf, "|VV_DELETED", sizeof(buf)); 4640 if (vp->v_vflag & VV_MD) 4641 strlcat(buf, "|VV_MD", sizeof(buf)); 4642 if (vp->v_vflag & VV_FORCEINSMQ) 4643 strlcat(buf, "|VV_FORCEINSMQ", sizeof(buf)); 4644 if (vp->v_vflag & VV_READLINK) 4645 strlcat(buf, "|VV_READLINK", sizeof(buf)); 4646 flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_ETERNALDEV | 4647 VV_CACHEDLABEL | VV_VMSIZEVNLOCK | VV_COPYONWRITE | VV_SYSTEM | 4648 VV_PROCDEP | VV_DELETED | VV_MD | VV_FORCEINSMQ | VV_READLINK); 4649 if (flags != 0) { 4650 snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags); 4651 strlcat(buf, buf2, sizeof(buf)); 4652 } 4653 if (vp->v_iflag & VI_MOUNT) 4654 strlcat(buf, "|VI_MOUNT", sizeof(buf)); 4655 if (vp->v_iflag & VI_DOINGINACT) 4656 strlcat(buf, "|VI_DOINGINACT", sizeof(buf)); 4657 if (vp->v_iflag & VI_OWEINACT) 4658 strlcat(buf, "|VI_OWEINACT", sizeof(buf)); 4659 if (vp->v_iflag & VI_DEFINACT) 4660 strlcat(buf, "|VI_DEFINACT", sizeof(buf)); 4661 if (vp->v_iflag & VI_FOPENING) 4662 strlcat(buf, "|VI_FOPENING", sizeof(buf)); 4663 flags = 
vp->v_iflag & ~(VI_MOUNT | VI_DOINGINACT | 4664 VI_OWEINACT | VI_DEFINACT | VI_FOPENING); 4665 if (flags != 0) { 4666 snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags); 4667 strlcat(buf, buf2, sizeof(buf)); 4668 } 4669 if (vp->v_mflag & VMP_LAZYLIST) 4670 strlcat(buf, "|VMP_LAZYLIST", sizeof(buf)); 4671 flags = vp->v_mflag & ~(VMP_LAZYLIST); 4672 if (flags != 0) { 4673 snprintf(buf2, sizeof(buf2), "|VMP(0x%lx)", flags); 4674 strlcat(buf, buf2, sizeof(buf)); 4675 } 4676 printf(" flags (%s)", buf + 1); 4677 if (mtx_owned(VI_MTX(vp))) 4678 printf(" VI_LOCKed"); 4679 printf("\n"); 4680 if (vp->v_object != NULL) 4681 printf(" v_object %p ref %d pages %d " 4682 "cleanbuf %d dirtybuf %d\n", 4683 vp->v_object, vp->v_object->ref_count, 4684 vp->v_object->resident_page_count, 4685 vp->v_bufobj.bo_clean.bv_cnt, 4686 vp->v_bufobj.bo_dirty.bv_cnt); 4687 printf(" "); 4688 lockmgr_printinfo(vp->v_vnlock); 4689 if (vp->v_data != NULL) 4690 VOP_PRINT(vp); 4691 } 4692 4693 #ifdef DDB 4694 /* 4695 * List all of the locked vnodes in the system. 4696 * Called when debugging the kernel. 4697 */ 4698 DB_SHOW_COMMAND_FLAGS(lockedvnods, lockedvnodes, DB_CMD_MEMSAFE) 4699 { 4700 struct mount *mp; 4701 struct vnode *vp; 4702 4703 /* 4704 * Note: because this is DDB, we can't obey the locking semantics 4705 * for these structures, which means we could catch an inconsistent 4706 * state and dereference a nasty pointer. Not much to be done 4707 * about that. 4708 */ 4709 db_printf("Locked vnodes\n"); 4710 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 4711 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 4712 if (vp->v_type != VMARKER && VOP_ISLOCKED(vp)) 4713 vn_printf(vp, "vnode "); 4714 } 4715 } 4716 } 4717 4718 /* 4719 * Show details about the given vnode. 4720 */ 4721 DB_SHOW_COMMAND(vnode, db_show_vnode) 4722 { 4723 struct vnode *vp; 4724 4725 if (!have_addr) 4726 return; 4727 vp = (struct vnode *)addr; 4728 vn_printf(vp, "vnode "); 4729 } 4730 4731 /* 4732 * Show details about the given mount point. 4733 */ 4734 DB_SHOW_COMMAND(mount, db_show_mount) 4735 { 4736 struct mount *mp; 4737 struct vfsopt *opt; 4738 struct statfs *sp; 4739 struct vnode *vp; 4740 char buf[512]; 4741 uint64_t mflags; 4742 u_int flags; 4743 4744 if (!have_addr) { 4745 /* No address given, print short info about all mount points. 
*/ 4746 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 4747 db_printf("%p %s on %s (%s)\n", mp, 4748 mp->mnt_stat.f_mntfromname, 4749 mp->mnt_stat.f_mntonname, 4750 mp->mnt_stat.f_fstypename); 4751 if (db_pager_quit) 4752 break; 4753 } 4754 db_printf("\nMore info: show mount <addr>\n"); 4755 return; 4756 } 4757 4758 mp = (struct mount *)addr; 4759 db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname, 4760 mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename); 4761 4762 buf[0] = '\0'; 4763 mflags = mp->mnt_flag; 4764 #define MNT_FLAG(flag) do { \ 4765 if (mflags & (flag)) { \ 4766 if (buf[0] != '\0') \ 4767 strlcat(buf, ", ", sizeof(buf)); \ 4768 strlcat(buf, (#flag) + 4, sizeof(buf)); \ 4769 mflags &= ~(flag); \ 4770 } \ 4771 } while (0) 4772 MNT_FLAG(MNT_RDONLY); 4773 MNT_FLAG(MNT_SYNCHRONOUS); 4774 MNT_FLAG(MNT_NOEXEC); 4775 MNT_FLAG(MNT_NOSUID); 4776 MNT_FLAG(MNT_NFS4ACLS); 4777 MNT_FLAG(MNT_UNION); 4778 MNT_FLAG(MNT_ASYNC); 4779 MNT_FLAG(MNT_SUIDDIR); 4780 MNT_FLAG(MNT_SOFTDEP); 4781 MNT_FLAG(MNT_NOSYMFOLLOW); 4782 MNT_FLAG(MNT_GJOURNAL); 4783 MNT_FLAG(MNT_MULTILABEL); 4784 MNT_FLAG(MNT_ACLS); 4785 MNT_FLAG(MNT_NOATIME); 4786 MNT_FLAG(MNT_NOCLUSTERR); 4787 MNT_FLAG(MNT_NOCLUSTERW); 4788 MNT_FLAG(MNT_SUJ); 4789 MNT_FLAG(MNT_EXRDONLY); 4790 MNT_FLAG(MNT_EXPORTED); 4791 MNT_FLAG(MNT_DEFEXPORTED); 4792 MNT_FLAG(MNT_EXPORTANON); 4793 MNT_FLAG(MNT_EXKERB); 4794 MNT_FLAG(MNT_EXPUBLIC); 4795 MNT_FLAG(MNT_LOCAL); 4796 MNT_FLAG(MNT_QUOTA); 4797 MNT_FLAG(MNT_ROOTFS); 4798 MNT_FLAG(MNT_USER); 4799 MNT_FLAG(MNT_IGNORE); 4800 MNT_FLAG(MNT_UPDATE); 4801 MNT_FLAG(MNT_DELEXPORT); 4802 MNT_FLAG(MNT_RELOAD); 4803 MNT_FLAG(MNT_FORCE); 4804 MNT_FLAG(MNT_SNAPSHOT); 4805 MNT_FLAG(MNT_BYFSID); 4806 MNT_FLAG(MNT_NAMEDATTR); 4807 #undef MNT_FLAG 4808 if (mflags != 0) { 4809 if (buf[0] != '\0') 4810 strlcat(buf, ", ", sizeof(buf)); 4811 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), 4812 "0x%016jx", mflags); 4813 } 4814 db_printf(" mnt_flag = %s\n", buf); 4815 4816 buf[0] = '\0'; 4817 flags = mp->mnt_kern_flag; 4818 #define MNT_KERN_FLAG(flag) do { \ 4819 if (flags & (flag)) { \ 4820 if (buf[0] != '\0') \ 4821 strlcat(buf, ", ", sizeof(buf)); \ 4822 strlcat(buf, (#flag) + 5, sizeof(buf)); \ 4823 flags &= ~(flag); \ 4824 } \ 4825 } while (0) 4826 MNT_KERN_FLAG(MNTK_UNMOUNTF); 4827 MNT_KERN_FLAG(MNTK_ASYNC); 4828 MNT_KERN_FLAG(MNTK_SOFTDEP); 4829 MNT_KERN_FLAG(MNTK_NOMSYNC); 4830 MNT_KERN_FLAG(MNTK_DRAINING); 4831 MNT_KERN_FLAG(MNTK_REFEXPIRE); 4832 MNT_KERN_FLAG(MNTK_EXTENDED_SHARED); 4833 MNT_KERN_FLAG(MNTK_SHARED_WRITES); 4834 MNT_KERN_FLAG(MNTK_NO_IOPF); 4835 MNT_KERN_FLAG(MNTK_RECURSE); 4836 MNT_KERN_FLAG(MNTK_UPPER_WAITER); 4837 MNT_KERN_FLAG(MNTK_UNLOCKED_INSMNTQUE); 4838 MNT_KERN_FLAG(MNTK_USES_BCACHE); 4839 MNT_KERN_FLAG(MNTK_VMSETSIZE_BUG); 4840 MNT_KERN_FLAG(MNTK_FPLOOKUP); 4841 MNT_KERN_FLAG(MNTK_TASKQUEUE_WAITER); 4842 MNT_KERN_FLAG(MNTK_NOASYNC); 4843 MNT_KERN_FLAG(MNTK_UNMOUNT); 4844 MNT_KERN_FLAG(MNTK_MWAIT); 4845 MNT_KERN_FLAG(MNTK_SUSPEND); 4846 MNT_KERN_FLAG(MNTK_SUSPEND2); 4847 MNT_KERN_FLAG(MNTK_SUSPENDED); 4848 MNT_KERN_FLAG(MNTK_NULL_NOCACHE); 4849 MNT_KERN_FLAG(MNTK_LOOKUP_SHARED); 4850 #undef MNT_KERN_FLAG 4851 if (flags != 0) { 4852 if (buf[0] != '\0') 4853 strlcat(buf, ", ", sizeof(buf)); 4854 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), 4855 "0x%08x", flags); 4856 } 4857 db_printf(" mnt_kern_flag = %s\n", buf); 4858 4859 db_printf(" mnt_opt = "); 4860 opt = TAILQ_FIRST(mp->mnt_opt); 4861 if (opt != NULL) { 4862 db_printf("%s", opt->name); 4863 opt = TAILQ_NEXT(opt, 
link); 4864 while (opt != NULL) { 4865 db_printf(", %s", opt->name); 4866 opt = TAILQ_NEXT(opt, link); 4867 } 4868 } 4869 db_printf("\n"); 4870 4871 sp = &mp->mnt_stat; 4872 db_printf(" mnt_stat = { version=%u type=%u flags=0x%016jx " 4873 "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju " 4874 "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju " 4875 "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n", 4876 (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags, 4877 (uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize, 4878 (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree, 4879 (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files, 4880 (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites, 4881 (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads, 4882 (uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax, 4883 (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]); 4884 4885 db_printf(" mnt_cred = { uid=%u ruid=%u", 4886 (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid); 4887 if (jailed(mp->mnt_cred)) 4888 db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id); 4889 db_printf(" }\n"); 4890 db_printf(" mnt_ref = %d (with %d in the struct)\n", 4891 vfs_mount_fetch_counter(mp, MNT_COUNT_REF), mp->mnt_ref); 4892 db_printf(" mnt_gen = %d\n", mp->mnt_gen); 4893 db_printf(" mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize); 4894 db_printf(" mnt_lazyvnodelistsize = %d\n", 4895 mp->mnt_lazyvnodelistsize); 4896 db_printf(" mnt_writeopcount = %d (with %d in the struct)\n", 4897 vfs_mount_fetch_counter(mp, MNT_COUNT_WRITEOPCOUNT), mp->mnt_writeopcount); 4898 db_printf(" mnt_iosize_max = %d\n", mp->mnt_iosize_max); 4899 db_printf(" mnt_hashseed = %u\n", mp->mnt_hashseed); 4900 db_printf(" mnt_lockref = %d (with %d in the struct)\n", 4901 vfs_mount_fetch_counter(mp, MNT_COUNT_LOCKREF), mp->mnt_lockref); 4902 db_printf(" mnt_secondary_writes = %d\n", mp->mnt_secondary_writes); 4903 db_printf(" mnt_secondary_accwrites = %d\n", 4904 mp->mnt_secondary_accwrites); 4905 db_printf(" mnt_gjprovider = %s\n", 4906 mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL"); 4907 db_printf(" mnt_vfs_ops = %d\n", mp->mnt_vfs_ops); 4908 4909 db_printf("\n\nList of active vnodes\n"); 4910 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 4911 if (vp->v_type != VMARKER && vp->v_holdcnt > 0) { 4912 vn_printf(vp, "vnode "); 4913 if (db_pager_quit) 4914 break; 4915 } 4916 } 4917 db_printf("\n\nList of inactive vnodes\n"); 4918 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 4919 if (vp->v_type != VMARKER && vp->v_holdcnt == 0) { 4920 vn_printf(vp, "vnode "); 4921 if (db_pager_quit) 4922 break; 4923 } 4924 } 4925 } 4926 #endif /* DDB */ 4927 4928 /* 4929 * Fill in a struct xvfsconf based on a struct vfsconf. 4930 */ 4931 static int 4932 vfsconf2x(struct sysctl_req *req, struct vfsconf *vfsp) 4933 { 4934 struct xvfsconf xvfsp; 4935 4936 bzero(&xvfsp, sizeof(xvfsp)); 4937 strcpy(xvfsp.vfc_name, vfsp->vfc_name); 4938 xvfsp.vfc_typenum = vfsp->vfc_typenum; 4939 xvfsp.vfc_refcount = vfsp->vfc_refcount; 4940 xvfsp.vfc_flags = vfsp->vfc_flags; 4941 /* 4942 * These are unused in userland, we keep them 4943 * to not break binary compatibility. 
4944 */ 4945 xvfsp.vfc_vfsops = NULL; 4946 xvfsp.vfc_next = NULL; 4947 return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); 4948 } 4949 4950 #ifdef COMPAT_FREEBSD32 4951 struct xvfsconf32 { 4952 uint32_t vfc_vfsops; 4953 char vfc_name[MFSNAMELEN]; 4954 int32_t vfc_typenum; 4955 int32_t vfc_refcount; 4956 int32_t vfc_flags; 4957 uint32_t vfc_next; 4958 }; 4959 4960 static int 4961 vfsconf2x32(struct sysctl_req *req, struct vfsconf *vfsp) 4962 { 4963 struct xvfsconf32 xvfsp; 4964 4965 bzero(&xvfsp, sizeof(xvfsp)); 4966 strcpy(xvfsp.vfc_name, vfsp->vfc_name); 4967 xvfsp.vfc_typenum = vfsp->vfc_typenum; 4968 xvfsp.vfc_refcount = vfsp->vfc_refcount; 4969 xvfsp.vfc_flags = vfsp->vfc_flags; 4970 return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); 4971 } 4972 #endif 4973 4974 /* 4975 * Top level filesystem related information gathering. 4976 */ 4977 static int 4978 sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS) 4979 { 4980 struct vfsconf *vfsp; 4981 int error; 4982 4983 error = 0; 4984 vfsconf_slock(); 4985 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { 4986 #ifdef COMPAT_FREEBSD32 4987 if (req->flags & SCTL_MASK32) 4988 error = vfsconf2x32(req, vfsp); 4989 else 4990 #endif 4991 error = vfsconf2x(req, vfsp); 4992 if (error) 4993 break; 4994 } 4995 vfsconf_sunlock(); 4996 return (error); 4997 } 4998 4999 SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD | 5000 CTLFLAG_MPSAFE, NULL, 0, sysctl_vfs_conflist, 5001 "S,xvfsconf", "List of all configured filesystems"); 5002 5003 #ifndef BURN_BRIDGES 5004 static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS); 5005 5006 static int 5007 vfs_sysctl(SYSCTL_HANDLER_ARGS) 5008 { 5009 int *name = (int *)arg1 - 1; /* XXX */ 5010 u_int namelen = arg2 + 1; /* XXX */ 5011 struct vfsconf *vfsp; 5012 5013 log(LOG_WARNING, "userland calling deprecated sysctl, " 5014 "please rebuild world\n"); 5015 5016 #if 1 || defined(COMPAT_PRELITE2) 5017 /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. 
*/ 5018 if (namelen == 1) 5019 return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); 5020 #endif 5021 5022 switch (name[1]) { 5023 case VFS_MAXTYPENUM: 5024 if (namelen != 2) 5025 return (ENOTDIR); 5026 return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); 5027 case VFS_CONF: 5028 if (namelen != 3) 5029 return (ENOTDIR); /* overloaded */ 5030 vfsconf_slock(); 5031 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { 5032 if (vfsp->vfc_typenum == name[2]) 5033 break; 5034 } 5035 vfsconf_sunlock(); 5036 if (vfsp == NULL) 5037 return (EOPNOTSUPP); 5038 #ifdef COMPAT_FREEBSD32 5039 if (req->flags & SCTL_MASK32) 5040 return (vfsconf2x32(req, vfsp)); 5041 else 5042 #endif 5043 return (vfsconf2x(req, vfsp)); 5044 } 5045 return (EOPNOTSUPP); 5046 } 5047 5048 static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP | 5049 CTLFLAG_MPSAFE, vfs_sysctl, 5050 "Generic filesystem"); 5051 5052 #if 1 || defined(COMPAT_PRELITE2) 5053 5054 static int 5055 sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS) 5056 { 5057 int error; 5058 struct vfsconf *vfsp; 5059 struct ovfsconf ovfs; 5060 5061 vfsconf_slock(); 5062 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { 5063 bzero(&ovfs, sizeof(ovfs)); 5064 ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ 5065 strcpy(ovfs.vfc_name, vfsp->vfc_name); 5066 ovfs.vfc_index = vfsp->vfc_typenum; 5067 ovfs.vfc_refcount = vfsp->vfc_refcount; 5068 ovfs.vfc_flags = vfsp->vfc_flags; 5069 error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); 5070 if (error != 0) { 5071 vfsconf_sunlock(); 5072 return (error); 5073 } 5074 } 5075 vfsconf_sunlock(); 5076 return (0); 5077 } 5078 5079 #endif /* 1 || COMPAT_PRELITE2 */ 5080 #endif /* !BURN_BRIDGES */ 5081 5082 static void 5083 unmount_or_warn(struct mount *mp) 5084 { 5085 int error; 5086 5087 error = dounmount(mp, MNT_FORCE, curthread); 5088 if (error != 0) { 5089 printf("unmount of %s failed (", mp->mnt_stat.f_mntonname); 5090 if (error == EBUSY) 5091 printf("BUSY)\n"); 5092 else 5093 printf("%d)\n", error); 5094 } 5095 } 5096 5097 /* 5098 * Unmount all filesystems. The list is traversed in reverse order 5099 * of mounting to avoid dependencies. 5100 */ 5101 void 5102 vfs_unmountall(void) 5103 { 5104 struct mount *mp, *tmp; 5105 5106 CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__); 5107 5108 /* 5109 * Since this only runs when rebooting, it is not interlocked. 5110 */ 5111 TAILQ_FOREACH_REVERSE_SAFE(mp, &mountlist, mntlist, mnt_list, tmp) { 5112 vfs_ref(mp); 5113 5114 /* 5115 * Forcibly unmounting "/dev" before "/" would prevent clean 5116 * unmount of the latter. 
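 * To avoid that, rootdevmp is skipped in this loop and unmounted last,
 * after the loop below.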
5117 */ 5118 if (mp == rootdevmp) 5119 continue; 5120 5121 unmount_or_warn(mp); 5122 } 5123 5124 if (rootdevmp != NULL) 5125 unmount_or_warn(rootdevmp); 5126 } 5127 5128 static void 5129 vfs_deferred_inactive(struct vnode *vp, int lkflags) 5130 { 5131 5132 ASSERT_VI_LOCKED(vp, __func__); 5133 VNPASS((vp->v_iflag & VI_DEFINACT) == 0, vp); 5134 if ((vp->v_iflag & VI_OWEINACT) == 0) { 5135 vdropl(vp); 5136 return; 5137 } 5138 if (vn_lock(vp, lkflags) == 0) { 5139 VI_LOCK(vp); 5140 vinactive(vp); 5141 VOP_UNLOCK(vp); 5142 vdropl(vp); 5143 return; 5144 } 5145 vdefer_inactive_unlocked(vp); 5146 } 5147 5148 static int 5149 vfs_periodic_inactive_filter(struct vnode *vp, void *arg) 5150 { 5151 5152 return (vp->v_iflag & VI_DEFINACT); 5153 } 5154 5155 static void __noinline 5156 vfs_periodic_inactive(struct mount *mp, int flags) 5157 { 5158 struct vnode *vp, *mvp; 5159 int lkflags; 5160 5161 lkflags = LK_EXCLUSIVE | LK_INTERLOCK; 5162 if (flags != MNT_WAIT) 5163 lkflags |= LK_NOWAIT; 5164 5165 MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, vfs_periodic_inactive_filter, NULL) { 5166 if ((vp->v_iflag & VI_DEFINACT) == 0) { 5167 VI_UNLOCK(vp); 5168 continue; 5169 } 5170 vp->v_iflag &= ~VI_DEFINACT; 5171 vfs_deferred_inactive(vp, lkflags); 5172 } 5173 } 5174 5175 static inline bool 5176 vfs_want_msync(struct vnode *vp) 5177 { 5178 struct vm_object *obj; 5179 5180 /* 5181 * This test may be performed without any locks held. 5182 * We rely on vm_object's type stability. 5183 */ 5184 if (vp->v_vflag & VV_NOSYNC) 5185 return (false); 5186 obj = vp->v_object; 5187 return (obj != NULL && vm_object_mightbedirty(obj)); 5188 } 5189 5190 static int 5191 vfs_periodic_msync_inactive_filter(struct vnode *vp, void *arg __unused) 5192 { 5193 5194 if (vp->v_vflag & VV_NOSYNC) 5195 return (false); 5196 if (vp->v_iflag & VI_DEFINACT) 5197 return (true); 5198 return (vfs_want_msync(vp)); 5199 } 5200 5201 static void __noinline 5202 vfs_periodic_msync_inactive(struct mount *mp, int flags) 5203 { 5204 struct vnode *vp, *mvp; 5205 int lkflags; 5206 bool seen_defer; 5207 5208 lkflags = LK_EXCLUSIVE | LK_INTERLOCK; 5209 if (flags != MNT_WAIT) 5210 lkflags |= LK_NOWAIT; 5211 5212 MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, vfs_periodic_msync_inactive_filter, NULL) { 5213 seen_defer = false; 5214 if (vp->v_iflag & VI_DEFINACT) { 5215 vp->v_iflag &= ~VI_DEFINACT; 5216 seen_defer = true; 5217 } 5218 if (!vfs_want_msync(vp)) { 5219 if (seen_defer) 5220 vfs_deferred_inactive(vp, lkflags); 5221 else 5222 VI_UNLOCK(vp); 5223 continue; 5224 } 5225 if (vget(vp, lkflags) == 0) { 5226 if ((vp->v_vflag & VV_NOSYNC) == 0) { 5227 if (flags == MNT_WAIT) 5228 vnode_pager_clean_sync(vp); 5229 else 5230 vnode_pager_clean_async(vp); 5231 } 5232 vput(vp); 5233 if (seen_defer) 5234 vdrop(vp); 5235 } else { 5236 if (seen_defer) 5237 vdefer_inactive_unlocked(vp); 5238 } 5239 } 5240 } 5241 5242 void 5243 vfs_periodic(struct mount *mp, int flags) 5244 { 5245 5246 CTR2(KTR_VFS, "%s: mp %p", __func__, mp); 5247 5248 if ((mp->mnt_kern_flag & MNTK_NOMSYNC) != 0) 5249 vfs_periodic_inactive(mp, flags); 5250 else 5251 vfs_periodic_msync_inactive(mp, flags); 5252 } 5253 5254 static void 5255 destroy_vpollinfo_free(struct vpollinfo *vi) 5256 { 5257 5258 knlist_destroy(&vi->vpi_selinfo.si_note); 5259 mtx_destroy(&vi->vpi_lock); 5260 free(vi, M_VNODEPOLL); 5261 } 5262 5263 static void 5264 destroy_vpollinfo(struct vpollinfo *vi) 5265 { 5266 KASSERT(TAILQ_EMPTY(&vi->vpi_inotify), 5267 ("%s: pollinfo %p has lingering watches", __func__, vi)); 5268 
knlist_clear(&vi->vpi_selinfo.si_note, 1);
5269 seldrain(&vi->vpi_selinfo);
5270 destroy_vpollinfo_free(vi);
5271 }
5272
5273 /*
5274 * Initialize per-vnode helper structure to hold poll-related state.
5275 */
5276 void
5277 v_addpollinfo(struct vnode *vp)
5278 {
5279 struct vpollinfo *vi;
5280
5281 if (atomic_load_ptr(&vp->v_pollinfo) != NULL)
5282 return;
5283 vi = malloc(sizeof(*vi), M_VNODEPOLL, M_WAITOK | M_ZERO);
5284 mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF);
5285 knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock,
5286 vfs_knlunlock, vfs_knl_assert_lock);
5287 TAILQ_INIT(&vi->vpi_inotify);
5288 VI_LOCK(vp);
5289 if (vp->v_pollinfo != NULL) {
5290 VI_UNLOCK(vp);
5291 destroy_vpollinfo_free(vi);
5292 return;
5293 }
5294 vp->v_pollinfo = vi;
5295 VI_UNLOCK(vp);
5296 }
5297
5298 /*
5299 * Record a process's interest in events which might happen to
5300 * a vnode. Because poll uses the historic select-style interface
5301 * internally, this routine serves as both the ``check for any
5302 * pending events'' and the ``record my interest in future events''
5303 * functions. (These are done together, while the lock is held,
5304 * to avoid race conditions.)
5305 */
5306 int
5307 vn_pollrecord(struct vnode *vp, struct thread *td, int events)
5308 {
5309
5310 v_addpollinfo(vp);
5311 mtx_lock(&vp->v_pollinfo->vpi_lock);
5312 if (vp->v_pollinfo->vpi_revents & events) {
5313 /*
5314 * This leaves events we are not interested
5315 * in available for the other process
5316 * which presumably had requested them
5317 * (otherwise they would never have been
5318 * recorded).
5319 */
5320 events &= vp->v_pollinfo->vpi_revents;
5321 vp->v_pollinfo->vpi_revents &= ~events;
5322
5323 mtx_unlock(&vp->v_pollinfo->vpi_lock);
5324 return (events);
5325 }
5326 vp->v_pollinfo->vpi_events |= events;
5327 selrecord(td, &vp->v_pollinfo->vpi_selinfo);
5328 mtx_unlock(&vp->v_pollinfo->vpi_lock);
5329 return (0);
5330 }
5331
5332 /*
5333 * Routine to create and manage a filesystem syncer vnode.
5334 */
5335 #define sync_close ((int (*)(struct vop_close_args *))nullop)
5336 static int sync_fsync(struct vop_fsync_args *);
5337 static int sync_inactive(struct vop_inactive_args *);
5338 static int sync_reclaim(struct vop_reclaim_args *);
5339
5340 static struct vop_vector sync_vnodeops = {
5341 .vop_bypass = VOP_EOPNOTSUPP,
5342 .vop_close = sync_close,
5343 .vop_fsync = sync_fsync,
5344 .vop_getwritemount = vop_stdgetwritemount,
5345 .vop_inactive = sync_inactive,
5346 .vop_need_inactive = vop_stdneed_inactive,
5347 .vop_reclaim = sync_reclaim,
5348 .vop_lock1 = vop_stdlock,
5349 .vop_unlock = vop_stdunlock,
5350 .vop_islocked = vop_stdislocked,
5351 .vop_fplookup_vexec = VOP_EAGAIN,
5352 .vop_fplookup_symlink = VOP_EAGAIN,
5353 };
5354 VFS_VOP_VECTOR_REGISTER(sync_vnodeops);
5355
5356 /*
5357 * Create a new filesystem syncer vnode for the specified mount point.
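 *
 * (Typically this is done when a filesystem is mounted or upgraded to
 * read/write; the syncer vnode's sync_fsync() below is what periodically
 * drives vfs_periodic() and VFS_SYNC(MNT_LAZY) for the mount.)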
5358 */
5359 void
5360 vfs_allocate_syncvnode(struct mount *mp)
5361 {
5362 struct vnode *vp;
5363 struct bufobj *bo;
5364 static long start, incr, next;
5365 int error;
5366
5367 /* Allocate a new vnode */
5368 error = getnewvnode("syncer", mp, &sync_vnodeops, &vp);
5369 if (error != 0)
5370 panic("vfs_allocate_syncvnode: getnewvnode() failed");
5371 vp->v_type = VNON;
5372 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
5373 vp->v_vflag |= VV_FORCEINSMQ;
5374 error = insmntque1(vp, mp);
5375 if (error != 0)
5376 panic("vfs_allocate_syncvnode: insmntque() failed");
5377 vp->v_vflag &= ~VV_FORCEINSMQ;
5378 vn_set_state(vp, VSTATE_CONSTRUCTED);
5379 VOP_UNLOCK(vp);
5380 /*
5381 * Place the vnode onto the syncer worklist. We attempt to
5382 * scatter them about on the list so that they will go off
5383 * at evenly distributed times even if all the filesystems
5384 * are mounted at once.
5385 */
5386 next += incr;
5387 if (next == 0 || next > syncer_maxdelay) {
5388 start /= 2;
5389 incr /= 2;
5390 if (start == 0) {
5391 start = syncer_maxdelay / 2;
5392 incr = syncer_maxdelay;
5393 }
5394 next = start;
5395 }
5396 bo = &vp->v_bufobj;
5397 BO_LOCK(bo);
5398 vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0);
5399 /* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */
5400 mtx_lock(&sync_mtx);
5401 sync_vnode_count++;
5402 if (mp->mnt_syncer == NULL) {
5403 mp->mnt_syncer = vp;
5404 vp = NULL;
5405 }
5406 mtx_unlock(&sync_mtx);
5407 BO_UNLOCK(bo);
5408 if (vp != NULL) {
5409 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
5410 vgone(vp);
5411 vput(vp);
5412 }
5413 }
5414
5415 void
5416 vfs_deallocate_syncvnode(struct mount *mp)
5417 {
5418 struct vnode *vp;
5419
5420 mtx_lock(&sync_mtx);
5421 vp = mp->mnt_syncer;
5422 if (vp != NULL)
5423 mp->mnt_syncer = NULL;
5424 mtx_unlock(&sync_mtx);
5425 if (vp != NULL)
5426 vrele(vp);
5427 }
5428
5429 /*
5430 * Do a lazy sync of the filesystem.
5431 */
5432 static int
5433 sync_fsync(struct vop_fsync_args *ap)
5434 {
5435 struct vnode *syncvp = ap->a_vp;
5436 struct mount *mp = syncvp->v_mount;
5437 int error, save;
5438 struct bufobj *bo;
5439
5440 /*
5441 * We only need to do something if this is a lazy evaluation.
5442 */
5443 if (ap->a_waitfor != MNT_LAZY)
5444 return (0);
5445
5446 /*
5447 * Move ourselves to the back of the sync list.
5448 */
5449 bo = &syncvp->v_bufobj;
5450 BO_LOCK(bo);
5451 vn_syncer_add_to_worklist(bo, syncdelay);
5452 BO_UNLOCK(bo);
5453
5454 /*
5455 * Walk the list of vnodes pushing all that are dirty and
5456 * not already on the sync list.
5457 */
5458 if (vfs_busy(mp, MBF_NOWAIT) != 0)
5459 return (0);
5460 VOP_UNLOCK(syncvp);
5461 save = curthread_pflags_set(TDP_SYNCIO);
5462 /*
5463 * The filesystem at hand may be idle with free vnodes stored in the
5464 * batch. Return them instead of letting them stay there indefinitely.
5465 */
5466 vfs_periodic(mp, MNT_NOWAIT);
5467 error = VFS_SYNC(mp, MNT_LAZY);
5468 curthread_pflags_restore(save);
5469 vn_lock(syncvp, LK_EXCLUSIVE | LK_RETRY);
5470 vfs_unbusy(mp);
5471 return (error);
5472 }
5473
5474 /*
5475 * The syncer vnode is no longer referenced.
5476 */
5477 static int
5478 sync_inactive(struct vop_inactive_args *ap)
5479 {
5480
5481 vgone(ap->a_vp);
5482 return (0);
5483 }
5484
5485 /*
5486 * The syncer vnode is no longer needed and is being decommissioned.
5487 *
5488 * Modifications to the worklist must be protected by sync_mtx.
5489 */
5490 static int
5491 sync_reclaim(struct vop_reclaim_args *ap)
5492 {
5493 struct vnode *vp = ap->a_vp;
5494 struct bufobj *bo;
5495
5496 bo = &vp->v_bufobj;
5497 BO_LOCK(bo);
5498 mtx_lock(&sync_mtx);
5499 if (vp->v_mount->mnt_syncer == vp)
5500 vp->v_mount->mnt_syncer = NULL;
5501 if (bo->bo_flag & BO_ONWORKLST) {
5502 LIST_REMOVE(bo, bo_synclist);
5503 syncer_worklist_len--;
5504 sync_vnode_count--;
5505 bo->bo_flag &= ~BO_ONWORKLST;
5506 }
5507 mtx_unlock(&sync_mtx);
5508 BO_UNLOCK(bo);
5509
5510 return (0);
5511 }
5512
5513 int
5514 vn_need_pageq_flush(struct vnode *vp)
5515 {
5516 struct vm_object *obj;
5517
5518 obj = vp->v_object;
5519 return (obj != NULL && (vp->v_vflag & VV_NOSYNC) == 0 &&
5520 vm_object_mightbedirty(obj));
5521 }
5522
5523 /*
5524 * Check if vnode represents a disk device
5525 */
5526 bool
5527 vn_isdisk_error(struct vnode *vp, int *errp)
5528 {
5529 int error;
5530
5531 if (vp->v_type != VCHR) {
5532 error = ENOTBLK;
5533 goto out;
5534 }
5535 error = 0;
5536 dev_lock();
5537 if (vp->v_rdev == NULL)
5538 error = ENXIO;
5539 else if (vp->v_rdev->si_devsw == NULL)
5540 error = ENXIO;
5541 else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK))
5542 error = ENOTBLK;
5543 dev_unlock();
5544 out:
5545 *errp = error;
5546 return (error == 0);
5547 }
5548
5549 bool
5550 vn_isdisk(struct vnode *vp)
5551 {
5552 int error;
5553
5554 return (vn_isdisk_error(vp, &error));
5555 }
5556
5557 /*
5558 * VOP_FPLOOKUP_VEXEC routines are subject to special circumstances, see
5559 * the comment above cache_fplookup for details.
5560 */
5561 int
5562 vaccess_vexec_smr(mode_t file_mode, uid_t file_uid, gid_t file_gid, struct ucred *cred)
5563 {
5564 int error;
5565
5566 VFS_SMR_ASSERT_ENTERED();
5567
5568 /* Check the owner. */
5569 if (cred->cr_uid == file_uid) {
5570 if (file_mode & S_IXUSR)
5571 return (0);
5572 goto out_error;
5573 }
5574
5575 /* Otherwise, check the groups (first match) */
5576 if (groupmember(file_gid, cred)) {
5577 if (file_mode & S_IXGRP)
5578 return (0);
5579 goto out_error;
5580 }
5581
5582 /* Otherwise, check everyone else. */
5583 if (file_mode & S_IXOTH)
5584 return (0);
5585 out_error:
5586 /*
5587 * Permission check failed, but it is possible denial will get overridden
5588 * (e.g., when root is traversing through a 700 directory owned by someone
5589 * else).
5590 *
5591 * vaccess() calls priv_check_cred which in turn can descend into MAC
5592 * modules overriding this result. It's quite unclear what semantics
5593 * they are allowed to operate with, thus for safety we don't call them
5594 * from within the SMR section. This also means if any such modules
5595 * are present, we have to let the regular lookup decide.
5596 */
5597 error = priv_check_cred_vfs_lookup_nomac(cred);
5598 switch (error) {
5599 case 0:
5600 return (0);
5601 case EAGAIN:
5602 /*
5603 * MAC modules present.
5604 */
5605 return (EAGAIN);
5606 case EPERM:
5607 return (EACCES);
5608 default:
5609 return (error);
5610 }
5611 }
5612
5613 /*
5614 * Common filesystem object access control check routine. Accepts a
5615 * vnode's type, "mode", uid and gid, requested access mode, and credentials.
5616 * Returns 0 on success, or an errno on failure.
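 *
 * A minimal usage sketch (hypothetical caller, not taken from this file):
 * a filesystem's VOP_ACCESS implementation would typically forward its
 * private per-vnode ownership and mode bits, e.g.:
 *
 *	return (vaccess(vp->v_type, node->mode, node->uid, node->gid,
 *	    ap->a_accmode, ap->a_cred));
 *
 * where 'node' stands for that filesystem's own inode data.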
5617 */ 5618 int 5619 vaccess(__enum_uint8(vtype) type, mode_t file_mode, uid_t file_uid, gid_t file_gid, 5620 accmode_t accmode, struct ucred *cred) 5621 { 5622 accmode_t dac_granted; 5623 accmode_t priv_granted; 5624 5625 KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0, 5626 ("invalid bit in accmode")); 5627 KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE), 5628 ("VAPPEND without VWRITE")); 5629 5630 /* 5631 * Look for a normal, non-privileged way to access the file/directory 5632 * as requested. If it exists, go with that. 5633 */ 5634 5635 dac_granted = 0; 5636 5637 /* Check the owner. */ 5638 if (cred->cr_uid == file_uid) { 5639 dac_granted |= VADMIN; 5640 if (file_mode & S_IXUSR) 5641 dac_granted |= VEXEC; 5642 if (file_mode & S_IRUSR) 5643 dac_granted |= VREAD; 5644 if (file_mode & S_IWUSR) 5645 dac_granted |= (VWRITE | VAPPEND); 5646 5647 if ((accmode & dac_granted) == accmode) 5648 return (0); 5649 5650 goto privcheck; 5651 } 5652 5653 /* Otherwise, check the groups (first match) */ 5654 if (groupmember(file_gid, cred)) { 5655 if (file_mode & S_IXGRP) 5656 dac_granted |= VEXEC; 5657 if (file_mode & S_IRGRP) 5658 dac_granted |= VREAD; 5659 if (file_mode & S_IWGRP) 5660 dac_granted |= (VWRITE | VAPPEND); 5661 5662 if ((accmode & dac_granted) == accmode) 5663 return (0); 5664 5665 goto privcheck; 5666 } 5667 5668 /* Otherwise, check everyone else. */ 5669 if (file_mode & S_IXOTH) 5670 dac_granted |= VEXEC; 5671 if (file_mode & S_IROTH) 5672 dac_granted |= VREAD; 5673 if (file_mode & S_IWOTH) 5674 dac_granted |= (VWRITE | VAPPEND); 5675 if ((accmode & dac_granted) == accmode) 5676 return (0); 5677 5678 privcheck: 5679 /* 5680 * Build a privilege mask to determine if the set of privileges 5681 * satisfies the requirements when combined with the granted mask 5682 * from above. For each privilege, if the privilege is required, 5683 * bitwise or the request type onto the priv_granted mask. 5684 */ 5685 priv_granted = 0; 5686 5687 if (type == VDIR) { 5688 /* 5689 * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC 5690 * requests, instead of PRIV_VFS_EXEC. 5691 */ 5692 if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && 5693 !priv_check_cred(cred, PRIV_VFS_LOOKUP)) 5694 priv_granted |= VEXEC; 5695 } else { 5696 /* 5697 * Ensure that at least one execute bit is on. Otherwise, 5698 * a privileged user will always succeed, and we don't want 5699 * this to happen unless the file really is executable. 5700 */ 5701 if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && 5702 (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 && 5703 !priv_check_cred(cred, PRIV_VFS_EXEC)) 5704 priv_granted |= VEXEC; 5705 } 5706 5707 if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) && 5708 !priv_check_cred(cred, PRIV_VFS_READ)) 5709 priv_granted |= VREAD; 5710 5711 if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) && 5712 !priv_check_cred(cred, PRIV_VFS_WRITE)) 5713 priv_granted |= (VWRITE | VAPPEND); 5714 5715 if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) && 5716 !priv_check_cred(cred, PRIV_VFS_ADMIN)) 5717 priv_granted |= VADMIN; 5718 5719 if ((accmode & (priv_granted | dac_granted)) == accmode) { 5720 return (0); 5721 } 5722 5723 return ((accmode & VADMIN) ? EPERM : EACCES); 5724 } 5725 5726 /* 5727 * Credential check based on process requesting service, and per-attribute 5728 * permissions. 
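 *
 * A sketch of a hypothetical caller: a filesystem's VOP_GETEXTATTR would
 * typically do
 *
 *	error = extattr_check_cred(vp, ap->a_attrnamespace, ap->a_cred,
 *	    ap->a_td, VREAD);
 *
 * (and pass VWRITE on the set/delete paths) before touching the attribute.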
5729 */ 5730 int 5731 extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred, 5732 struct thread *td, accmode_t accmode) 5733 { 5734 5735 /* 5736 * Kernel-invoked always succeeds. 5737 */ 5738 if (cred == NOCRED) 5739 return (0); 5740 5741 /* 5742 * Do not allow privileged processes in jail to directly manipulate 5743 * system attributes. 5744 */ 5745 switch (attrnamespace) { 5746 case EXTATTR_NAMESPACE_SYSTEM: 5747 /* Potentially should be: return (EPERM); */ 5748 return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM)); 5749 case EXTATTR_NAMESPACE_USER: 5750 return (VOP_ACCESS(vp, accmode, cred, td)); 5751 default: 5752 return (EPERM); 5753 } 5754 } 5755 5756 #ifdef INVARIANTS 5757 void 5758 assert_vi_locked(struct vnode *vp, const char *str) 5759 { 5760 VNASSERT(mtx_owned(VI_MTX(vp)), vp, 5761 ("%s: vnode interlock is not locked but should be", str)); 5762 } 5763 5764 void 5765 assert_vi_unlocked(struct vnode *vp, const char *str) 5766 { 5767 VNASSERT(!mtx_owned(VI_MTX(vp)), vp, 5768 ("%s: vnode interlock is locked but should not be", str)); 5769 } 5770 5771 void 5772 assert_vop_locked(struct vnode *vp, const char *str) 5773 { 5774 bool locked; 5775 5776 if (KERNEL_PANICKED() || vp == NULL) 5777 return; 5778 5779 #ifdef WITNESS 5780 locked = !((vp->v_irflag & VIRF_CROSSMP) == 0 && 5781 witness_is_owned(&vp->v_vnlock->lock_object) == -1); 5782 #else 5783 int state = VOP_ISLOCKED(vp); 5784 locked = state != 0 && state != LK_EXCLOTHER; 5785 #endif 5786 VNASSERT(locked, vp, ("%s: vnode is not locked but should be", str)); 5787 } 5788 5789 void 5790 assert_vop_unlocked(struct vnode *vp, const char *str) 5791 { 5792 bool locked; 5793 5794 if (KERNEL_PANICKED() || vp == NULL) 5795 return; 5796 5797 #ifdef WITNESS 5798 locked = (vp->v_irflag & VIRF_CROSSMP) == 0 && 5799 witness_is_owned(&vp->v_vnlock->lock_object) == 1; 5800 #else 5801 locked = VOP_ISLOCKED(vp) == LK_EXCLUSIVE; 5802 #endif 5803 VNASSERT(!locked, vp, ("%s: vnode is locked but should not be", str)); 5804 } 5805 5806 void 5807 assert_vop_elocked(struct vnode *vp, const char *str) 5808 { 5809 bool locked; 5810 5811 if (KERNEL_PANICKED() || vp == NULL) 5812 return; 5813 5814 locked = VOP_ISLOCKED(vp) == LK_EXCLUSIVE; 5815 VNASSERT(locked, vp, 5816 ("%s: vnode is not exclusive locked but should be", str)); 5817 } 5818 #endif /* INVARIANTS */ 5819 5820 void 5821 vop_rename_fail(struct vop_rename_args *ap) 5822 { 5823 5824 if (ap->a_tvp != NULL) 5825 vput(ap->a_tvp); 5826 if (ap->a_tdvp == ap->a_tvp) 5827 vrele(ap->a_tdvp); 5828 else 5829 vput(ap->a_tdvp); 5830 vrele(ap->a_fdvp); 5831 vrele(ap->a_fvp); 5832 } 5833 5834 void 5835 vop_rename_pre(void *ap) 5836 { 5837 struct vop_rename_args *a = ap; 5838 5839 #ifdef INVARIANTS 5840 struct mount *tmp; 5841 5842 if (a->a_tvp) 5843 ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME"); 5844 ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME"); 5845 ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME"); 5846 ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME"); 5847 5848 /* Check the source (from). */ 5849 if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock && 5850 (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock)) 5851 ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked"); 5852 if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock) 5853 ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked"); 5854 5855 /* Check the target. 
*/ 5856 if (a->a_tvp) 5857 ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked"); 5858 ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked"); 5859 5860 tmp = NULL; 5861 VOP_GETWRITEMOUNT(a->a_tdvp, &tmp); 5862 lockmgr_assert(&tmp->mnt_renamelock, KA_XLOCKED); 5863 vfs_rel(tmp); 5864 #endif 5865 /* 5866 * It may be tempting to add vn_seqc_write_begin/end calls here and 5867 * in vop_rename_post but that's not going to work out since some 5868 * filesystems relookup vnodes mid-rename. This is probably a bug. 5869 * 5870 * For now filesystems are expected to do the relevant calls after they 5871 * decide what vnodes to operate on. 5872 */ 5873 if (a->a_tdvp != a->a_fdvp) 5874 vhold(a->a_fdvp); 5875 if (a->a_tvp != a->a_fvp) 5876 vhold(a->a_fvp); 5877 vhold(a->a_tdvp); 5878 if (a->a_tvp) 5879 vhold(a->a_tvp); 5880 } 5881 5882 #ifdef INVARIANTS 5883 void 5884 vop_fplookup_vexec_debugpre(void *ap __unused) 5885 { 5886 5887 VFS_SMR_ASSERT_ENTERED(); 5888 } 5889 5890 void 5891 vop_fplookup_vexec_debugpost(void *ap, int rc) 5892 { 5893 struct vop_fplookup_vexec_args *a; 5894 struct vnode *vp; 5895 5896 a = ap; 5897 vp = a->a_vp; 5898 5899 VFS_SMR_ASSERT_ENTERED(); 5900 if (rc == EOPNOTSUPP) 5901 VNPASS(VN_IS_DOOMED(vp), vp); 5902 } 5903 5904 void 5905 vop_fplookup_symlink_debugpre(void *ap __unused) 5906 { 5907 5908 VFS_SMR_ASSERT_ENTERED(); 5909 } 5910 5911 void 5912 vop_fplookup_symlink_debugpost(void *ap __unused, int rc __unused) 5913 { 5914 5915 VFS_SMR_ASSERT_ENTERED(); 5916 } 5917 5918 static void 5919 vop_fsync_debugprepost(struct vnode *vp, const char *name) 5920 { 5921 struct mount *mp; 5922 5923 if (vp->v_type == VCHR) 5924 ; 5925 /* 5926 * The shared vs. exclusive locking policy for fsync() 5927 * is actually determined by vp's write mount as indicated 5928 * by VOP_GETWRITEMOUNT(), which for stacked filesystems 5929 * may not be the same as vp->v_mount. However, if the 5930 * underlying filesystem which really handles the fsync() 5931 * supports shared locking, the stacked filesystem must also 5932 * be prepared for its VOP_FSYNC() operation to be called 5933 * with only a shared lock. On the other hand, if the 5934 * stacked filesystem claims support for shared write 5935 * locking but the underlying filesystem does not, and the 5936 * caller incorrectly uses a shared lock, this condition 5937 * should still be caught when the stacked filesystem 5938 * invokes VOP_FSYNC() on the underlying filesystem. 
5939 */ 5940 else { 5941 mp = NULL; 5942 VOP_GETWRITEMOUNT(vp, &mp); 5943 if (vn_lktype_write(mp, vp) == LK_SHARED) 5944 ASSERT_VOP_LOCKED(vp, name); 5945 else 5946 ASSERT_VOP_ELOCKED(vp, name); 5947 if (mp != NULL) 5948 vfs_rel(mp); 5949 } 5950 } 5951 5952 void 5953 vop_fsync_debugpre(void *a) 5954 { 5955 struct vop_fsync_args *ap; 5956 5957 ap = a; 5958 vop_fsync_debugprepost(ap->a_vp, "fsync"); 5959 } 5960 5961 void 5962 vop_fsync_debugpost(void *a, int rc __unused) 5963 { 5964 struct vop_fsync_args *ap; 5965 5966 ap = a; 5967 vop_fsync_debugprepost(ap->a_vp, "fsync"); 5968 } 5969 5970 void 5971 vop_fdatasync_debugpre(void *a) 5972 { 5973 struct vop_fdatasync_args *ap; 5974 5975 ap = a; 5976 vop_fsync_debugprepost(ap->a_vp, "fsync"); 5977 } 5978 5979 void 5980 vop_fdatasync_debugpost(void *a, int rc __unused) 5981 { 5982 struct vop_fdatasync_args *ap; 5983 5984 ap = a; 5985 vop_fsync_debugprepost(ap->a_vp, "fsync"); 5986 } 5987 5988 void 5989 vop_strategy_debugpre(void *ap) 5990 { 5991 struct vop_strategy_args *a; 5992 struct buf *bp; 5993 5994 a = ap; 5995 bp = a->a_bp; 5996 5997 /* 5998 * Cluster ops lock their component buffers but not the IO container. 5999 */ 6000 if ((bp->b_flags & B_CLUSTER) != 0) 6001 return; 6002 6003 BUF_ASSERT_LOCKED(bp); 6004 } 6005 6006 void 6007 vop_lock_debugpre(void *ap) 6008 { 6009 struct vop_lock1_args *a = ap; 6010 6011 if ((a->a_flags & LK_INTERLOCK) == 0) 6012 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); 6013 else 6014 ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK"); 6015 } 6016 6017 void 6018 vop_lock_debugpost(void *ap, int rc) 6019 { 6020 struct vop_lock1_args *a = ap; 6021 6022 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); 6023 if (rc == 0 && (a->a_flags & LK_EXCLOTHER) == 0) 6024 ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK"); 6025 } 6026 6027 void 6028 vop_unlock_debugpre(void *ap) 6029 { 6030 struct vop_unlock_args *a = ap; 6031 struct vnode *vp = a->a_vp; 6032 6033 VNPASS(vn_get_state(vp) != VSTATE_UNINITIALIZED, vp); 6034 ASSERT_VOP_LOCKED(vp, "VOP_UNLOCK"); 6035 } 6036 6037 void 6038 vop_need_inactive_debugpre(void *ap) 6039 { 6040 struct vop_need_inactive_args *a = ap; 6041 6042 ASSERT_VI_LOCKED(a->a_vp, "VOP_NEED_INACTIVE"); 6043 } 6044 6045 void 6046 vop_need_inactive_debugpost(void *ap, int rc) 6047 { 6048 struct vop_need_inactive_args *a = ap; 6049 6050 ASSERT_VI_LOCKED(a->a_vp, "VOP_NEED_INACTIVE"); 6051 } 6052 #endif /* INVARIANTS */ 6053 6054 void 6055 vop_allocate_post(void *ap, int rc) 6056 { 6057 struct vop_allocate_args *a; 6058 6059 a = ap; 6060 if (rc == 0) 6061 INOTIFY(a->a_vp, IN_MODIFY); 6062 } 6063 6064 void 6065 vop_copy_file_range_post(void *ap, int rc) 6066 { 6067 struct vop_copy_file_range_args *a; 6068 6069 a = ap; 6070 if (rc == 0) { 6071 INOTIFY(a->a_invp, IN_ACCESS); 6072 INOTIFY(a->a_outvp, IN_MODIFY); 6073 } 6074 } 6075 6076 void 6077 vop_create_pre(void *ap) 6078 { 6079 struct vop_create_args *a; 6080 struct vnode *dvp; 6081 6082 a = ap; 6083 dvp = a->a_dvp; 6084 vn_seqc_write_begin(dvp); 6085 } 6086 6087 void 6088 vop_create_post(void *ap, int rc) 6089 { 6090 struct vop_create_args *a; 6091 struct vnode *dvp; 6092 6093 a = ap; 6094 dvp = a->a_dvp; 6095 vn_seqc_write_end(dvp); 6096 if (!rc) { 6097 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); 6098 INOTIFY_NAME(*a->a_vpp, dvp, a->a_cnp, IN_CREATE); 6099 } 6100 } 6101 6102 void 6103 vop_deallocate_post(void *ap, int rc) 6104 { 6105 struct vop_deallocate_args *a; 6106 6107 a = ap; 6108 if (rc == 0) 6109 INOTIFY(a->a_vp, IN_MODIFY); 6110 } 6111 6112 void 6113 vop_whiteout_pre(void *ap) 6114 { 6115 
struct vop_whiteout_args *a; 6116 struct vnode *dvp; 6117 6118 a = ap; 6119 dvp = a->a_dvp; 6120 vn_seqc_write_begin(dvp); 6121 } 6122 6123 void 6124 vop_whiteout_post(void *ap, int rc) 6125 { 6126 struct vop_whiteout_args *a; 6127 struct vnode *dvp; 6128 6129 a = ap; 6130 dvp = a->a_dvp; 6131 vn_seqc_write_end(dvp); 6132 } 6133 6134 void 6135 vop_deleteextattr_pre(void *ap) 6136 { 6137 struct vop_deleteextattr_args *a; 6138 struct vnode *vp; 6139 6140 a = ap; 6141 vp = a->a_vp; 6142 vn_seqc_write_begin(vp); 6143 } 6144 6145 void 6146 vop_deleteextattr_post(void *ap, int rc) 6147 { 6148 struct vop_deleteextattr_args *a; 6149 struct vnode *vp; 6150 6151 a = ap; 6152 vp = a->a_vp; 6153 vn_seqc_write_end(vp); 6154 if (!rc) { 6155 VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); 6156 INOTIFY(vp, IN_ATTRIB); 6157 } 6158 } 6159 6160 void 6161 vop_link_pre(void *ap) 6162 { 6163 struct vop_link_args *a; 6164 struct vnode *vp, *tdvp; 6165 6166 a = ap; 6167 vp = a->a_vp; 6168 tdvp = a->a_tdvp; 6169 vn_seqc_write_begin(vp); 6170 vn_seqc_write_begin(tdvp); 6171 } 6172 6173 void 6174 vop_link_post(void *ap, int rc) 6175 { 6176 struct vop_link_args *a; 6177 struct vnode *vp, *tdvp; 6178 6179 a = ap; 6180 vp = a->a_vp; 6181 tdvp = a->a_tdvp; 6182 vn_seqc_write_end(vp); 6183 vn_seqc_write_end(tdvp); 6184 if (!rc) { 6185 VFS_KNOTE_LOCKED(vp, NOTE_LINK); 6186 VFS_KNOTE_LOCKED(tdvp, NOTE_WRITE); 6187 INOTIFY_NAME(vp, tdvp, a->a_cnp, _IN_ATTRIB_LINKCOUNT); 6188 INOTIFY_NAME(vp, tdvp, a->a_cnp, IN_CREATE); 6189 } 6190 } 6191 6192 void 6193 vop_mkdir_pre(void *ap) 6194 { 6195 struct vop_mkdir_args *a; 6196 struct vnode *dvp; 6197 6198 a = ap; 6199 dvp = a->a_dvp; 6200 vn_seqc_write_begin(dvp); 6201 } 6202 6203 void 6204 vop_mkdir_post(void *ap, int rc) 6205 { 6206 struct vop_mkdir_args *a; 6207 struct vnode *dvp; 6208 6209 a = ap; 6210 dvp = a->a_dvp; 6211 vn_seqc_write_end(dvp); 6212 if (!rc) { 6213 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK); 6214 INOTIFY_NAME(*a->a_vpp, dvp, a->a_cnp, IN_CREATE); 6215 } 6216 } 6217 6218 #ifdef INVARIANTS 6219 void 6220 vop_mkdir_debugpost(void *ap, int rc) 6221 { 6222 struct vop_mkdir_args *a; 6223 6224 a = ap; 6225 if (!rc) 6226 cache_validate(a->a_dvp, *a->a_vpp, a->a_cnp); 6227 } 6228 #endif 6229 6230 void 6231 vop_mknod_pre(void *ap) 6232 { 6233 struct vop_mknod_args *a; 6234 struct vnode *dvp; 6235 6236 a = ap; 6237 dvp = a->a_dvp; 6238 vn_seqc_write_begin(dvp); 6239 } 6240 6241 void 6242 vop_mknod_post(void *ap, int rc) 6243 { 6244 struct vop_mknod_args *a; 6245 struct vnode *dvp; 6246 6247 a = ap; 6248 dvp = a->a_dvp; 6249 vn_seqc_write_end(dvp); 6250 if (!rc) { 6251 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); 6252 INOTIFY_NAME(*a->a_vpp, dvp, a->a_cnp, IN_CREATE); 6253 } 6254 } 6255 6256 void 6257 vop_reclaim_post(void *ap, int rc) 6258 { 6259 struct vop_reclaim_args *a; 6260 struct vnode *vp; 6261 6262 a = ap; 6263 vp = a->a_vp; 6264 ASSERT_VOP_IN_SEQC(vp); 6265 if (!rc) { 6266 VFS_KNOTE_LOCKED(vp, NOTE_REVOKE); 6267 INOTIFY_REVOKE(vp); 6268 } 6269 } 6270 6271 void 6272 vop_remove_pre(void *ap) 6273 { 6274 struct vop_remove_args *a; 6275 struct vnode *dvp, *vp; 6276 6277 a = ap; 6278 dvp = a->a_dvp; 6279 vp = a->a_vp; 6280 vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK); 6281 vn_seqc_write_begin(dvp); 6282 vn_seqc_write_begin(vp); 6283 } 6284 6285 void 6286 vop_remove_post(void *ap, int rc) 6287 { 6288 struct vop_remove_args *a; 6289 struct vnode *dvp, *vp; 6290 6291 a = ap; 6292 dvp = a->a_dvp; 6293 vp = a->a_vp; 6294 vn_seqc_write_end(dvp); 6295 vn_seqc_write_end(vp); 6296 if (!rc) 
{ 6297 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); 6298 VFS_KNOTE_LOCKED(vp, NOTE_DELETE); 6299 INOTIFY_NAME(vp, dvp, a->a_cnp, _IN_ATTRIB_LINKCOUNT); 6300 INOTIFY_NAME(vp, dvp, a->a_cnp, IN_DELETE); 6301 } 6302 } 6303 6304 void 6305 vop_rename_post(void *ap, int rc) 6306 { 6307 struct vop_rename_args *a = ap; 6308 long hint; 6309 6310 if (!rc) { 6311 hint = NOTE_WRITE; 6312 if (a->a_fdvp == a->a_tdvp) { 6313 if (a->a_tvp != NULL && a->a_tvp->v_type == VDIR) 6314 hint |= NOTE_LINK; 6315 VFS_KNOTE_UNLOCKED(a->a_fdvp, hint); 6316 VFS_KNOTE_UNLOCKED(a->a_tdvp, hint); 6317 } else { 6318 hint |= NOTE_EXTEND; 6319 if (a->a_fvp->v_type == VDIR) 6320 hint |= NOTE_LINK; 6321 VFS_KNOTE_UNLOCKED(a->a_fdvp, hint); 6322 6323 if (a->a_fvp->v_type == VDIR && a->a_tvp != NULL && 6324 a->a_tvp->v_type == VDIR) 6325 hint &= ~NOTE_LINK; 6326 VFS_KNOTE_UNLOCKED(a->a_tdvp, hint); 6327 } 6328 6329 VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME); 6330 if (a->a_tvp) 6331 VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE); 6332 INOTIFY_MOVE(a->a_fvp, a->a_fdvp, a->a_fcnp, a->a_tvp, 6333 a->a_tdvp, a->a_tcnp); 6334 } 6335 if (a->a_tdvp != a->a_fdvp) 6336 vdrop(a->a_fdvp); 6337 if (a->a_tvp != a->a_fvp) 6338 vdrop(a->a_fvp); 6339 vdrop(a->a_tdvp); 6340 if (a->a_tvp) 6341 vdrop(a->a_tvp); 6342 } 6343 6344 void 6345 vop_rmdir_pre(void *ap) 6346 { 6347 struct vop_rmdir_args *a; 6348 struct vnode *dvp, *vp; 6349 6350 a = ap; 6351 dvp = a->a_dvp; 6352 vp = a->a_vp; 6353 vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK); 6354 vn_seqc_write_begin(dvp); 6355 vn_seqc_write_begin(vp); 6356 } 6357 6358 void 6359 vop_rmdir_post(void *ap, int rc) 6360 { 6361 struct vop_rmdir_args *a; 6362 struct vnode *dvp, *vp; 6363 6364 a = ap; 6365 dvp = a->a_dvp; 6366 vp = a->a_vp; 6367 vn_seqc_write_end(dvp); 6368 vn_seqc_write_end(vp); 6369 if (!rc) { 6370 vp->v_vflag |= VV_UNLINKED; 6371 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK); 6372 VFS_KNOTE_LOCKED(vp, NOTE_DELETE); 6373 INOTIFY_NAME(vp, dvp, a->a_cnp, IN_DELETE); 6374 } 6375 } 6376 6377 void 6378 vop_setattr_pre(void *ap) 6379 { 6380 struct vop_setattr_args *a; 6381 struct vnode *vp; 6382 6383 a = ap; 6384 vp = a->a_vp; 6385 vn_seqc_write_begin(vp); 6386 } 6387 6388 void 6389 vop_setattr_post(void *ap, int rc) 6390 { 6391 struct vop_setattr_args *a; 6392 struct vnode *vp; 6393 6394 a = ap; 6395 vp = a->a_vp; 6396 vn_seqc_write_end(vp); 6397 if (!rc) { 6398 VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB); 6399 INOTIFY(vp, IN_ATTRIB); 6400 } 6401 } 6402 6403 void 6404 vop_setacl_pre(void *ap) 6405 { 6406 struct vop_setacl_args *a; 6407 struct vnode *vp; 6408 6409 a = ap; 6410 vp = a->a_vp; 6411 vn_seqc_write_begin(vp); 6412 } 6413 6414 void 6415 vop_setacl_post(void *ap, int rc __unused) 6416 { 6417 struct vop_setacl_args *a; 6418 struct vnode *vp; 6419 6420 a = ap; 6421 vp = a->a_vp; 6422 vn_seqc_write_end(vp); 6423 } 6424 6425 void 6426 vop_setextattr_pre(void *ap) 6427 { 6428 struct vop_setextattr_args *a; 6429 struct vnode *vp; 6430 6431 a = ap; 6432 vp = a->a_vp; 6433 vn_seqc_write_begin(vp); 6434 } 6435 6436 void 6437 vop_setextattr_post(void *ap, int rc) 6438 { 6439 struct vop_setextattr_args *a; 6440 struct vnode *vp; 6441 6442 a = ap; 6443 vp = a->a_vp; 6444 vn_seqc_write_end(vp); 6445 if (!rc) { 6446 VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB); 6447 INOTIFY(vp, IN_ATTRIB); 6448 } 6449 } 6450 6451 void 6452 vop_symlink_pre(void *ap) 6453 { 6454 struct vop_symlink_args *a; 6455 struct vnode *dvp; 6456 6457 a = ap; 6458 dvp = a->a_dvp; 6459 vn_seqc_write_begin(dvp); 6460 } 6461 6462 void 6463 vop_symlink_post(void *ap, int rc) 
6464 { 6465 struct vop_symlink_args *a; 6466 struct vnode *dvp; 6467 6468 a = ap; 6469 dvp = a->a_dvp; 6470 vn_seqc_write_end(dvp); 6471 if (!rc) { 6472 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); 6473 INOTIFY_NAME(*a->a_vpp, dvp, a->a_cnp, IN_CREATE); 6474 } 6475 } 6476 6477 void 6478 vop_open_post(void *ap, int rc) 6479 { 6480 struct vop_open_args *a = ap; 6481 6482 if (!rc) { 6483 VFS_KNOTE_LOCKED(a->a_vp, NOTE_OPEN); 6484 INOTIFY(a->a_vp, IN_OPEN); 6485 } 6486 } 6487 6488 void 6489 vop_close_post(void *ap, int rc) 6490 { 6491 struct vop_close_args *a = ap; 6492 6493 if (!rc && (a->a_cred != NOCRED || /* filter out revokes */ 6494 !VN_IS_DOOMED(a->a_vp))) { 6495 VFS_KNOTE_LOCKED(a->a_vp, (a->a_fflag & FWRITE) != 0 ? 6496 NOTE_CLOSE_WRITE : NOTE_CLOSE); 6497 INOTIFY(a->a_vp, (a->a_fflag & FWRITE) != 0 ? 6498 IN_CLOSE_WRITE : IN_CLOSE_NOWRITE); 6499 } 6500 } 6501 6502 void 6503 vop_read_post(void *ap, int rc) 6504 { 6505 struct vop_read_args *a = ap; 6506 6507 if (!rc) { 6508 VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ); 6509 INOTIFY(a->a_vp, IN_ACCESS); 6510 } 6511 } 6512 6513 void 6514 vop_read_pgcache_post(void *ap, int rc) 6515 { 6516 struct vop_read_pgcache_args *a = ap; 6517 6518 if (!rc) 6519 VFS_KNOTE_UNLOCKED(a->a_vp, NOTE_READ); 6520 } 6521 6522 static struct knlist fs_knlist; 6523 6524 static void 6525 vfs_event_init(void *arg) 6526 { 6527 knlist_init_mtx(&fs_knlist, NULL); 6528 } 6529 /* XXX - correct order? */ 6530 SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL); 6531 6532 void 6533 vfs_event_signal(fsid_t *fsid, uint32_t event, intptr_t data __unused) 6534 { 6535 6536 KNOTE_UNLOCKED(&fs_knlist, event); 6537 } 6538 6539 static int filt_fsattach(struct knote *kn); 6540 static void filt_fsdetach(struct knote *kn); 6541 static int filt_fsevent(struct knote *kn, long hint); 6542 6543 const struct filterops fs_filtops = { 6544 .f_isfd = 0, 6545 .f_attach = filt_fsattach, 6546 .f_detach = filt_fsdetach, 6547 .f_event = filt_fsevent, 6548 }; 6549 6550 static int 6551 filt_fsattach(struct knote *kn) 6552 { 6553 6554 kn->kn_flags |= EV_CLEAR; 6555 knlist_add(&fs_knlist, kn, 0); 6556 return (0); 6557 } 6558 6559 static void 6560 filt_fsdetach(struct knote *kn) 6561 { 6562 6563 knlist_remove(&fs_knlist, kn, 0); 6564 } 6565 6566 static int 6567 filt_fsevent(struct knote *kn, long hint) 6568 { 6569 6570 kn->kn_fflags |= kn->kn_sfflags & hint; 6571 6572 return (kn->kn_fflags != 0); 6573 } 6574 6575 static int 6576 sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS) 6577 { 6578 struct vfsidctl vc; 6579 int error; 6580 struct mount *mp; 6581 6582 if (req->newptr == NULL) 6583 return (EINVAL); 6584 error = SYSCTL_IN(req, &vc, sizeof(vc)); 6585 if (error) 6586 return (error); 6587 if (vc.vc_vers != VFS_CTL_VERS1) 6588 return (EINVAL); 6589 mp = vfs_getvfs(&vc.vc_fsid); 6590 if (mp == NULL) 6591 return (ENOENT); 6592 /* ensure that a specific sysctl goes to the right filesystem. */ 6593 if (strcmp(vc.vc_fstypename, "*") != 0 && 6594 strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) { 6595 vfs_rel(mp); 6596 return (EINVAL); 6597 } 6598 VCTLTOREQ(&vc, req); 6599 error = VFS_SYSCTL(mp, vc.vc_op, req); 6600 vfs_rel(mp); 6601 return (error); 6602 } 6603 6604 SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE | CTLFLAG_MPSAFE | CTLFLAG_WR, 6605 NULL, 0, sysctl_vfs_ctl, "", 6606 "Sysctl by fsid"); 6607 6608 /* 6609 * Function to initialize a va_filerev field sensibly. 6610 * XXX: Wouldn't a random number make a lot more sense ?? 
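The value computed below packs the seconds of getbinuptime() into the upper 32 bits and the most significant 32 bits of the binary fraction into the lower half, so at least it increases monotonically for the lifetime of a boot.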
6611 */ 6612 u_quad_t 6613 init_va_filerev(void) 6614 { 6615 struct bintime bt; 6616 6617 getbinuptime(&bt); 6618 return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL)); 6619 } 6620 6621 static int filt_vfsread(struct knote *kn, long hint); 6622 static int filt_vfswrite(struct knote *kn, long hint); 6623 static int filt_vfsvnode(struct knote *kn, long hint); 6624 static void filt_vfsdetach(struct knote *kn); 6625 static int filt_vfsdump(struct proc *p, struct knote *kn, 6626 struct kinfo_knote *kin); 6627 6628 static const struct filterops vfsread_filtops = { 6629 .f_isfd = 1, 6630 .f_detach = filt_vfsdetach, 6631 .f_event = filt_vfsread, 6632 .f_userdump = filt_vfsdump, 6633 }; 6634 static const struct filterops vfswrite_filtops = { 6635 .f_isfd = 1, 6636 .f_detach = filt_vfsdetach, 6637 .f_event = filt_vfswrite, 6638 .f_userdump = filt_vfsdump, 6639 }; 6640 static const struct filterops vfsvnode_filtops = { 6641 .f_isfd = 1, 6642 .f_detach = filt_vfsdetach, 6643 .f_event = filt_vfsvnode, 6644 .f_userdump = filt_vfsdump, 6645 }; 6646 6647 static void 6648 vfs_knllock(void *arg) 6649 { 6650 struct vnode *vp = arg; 6651 6652 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 6653 } 6654 6655 static void 6656 vfs_knlunlock(void *arg) 6657 { 6658 struct vnode *vp = arg; 6659 6660 VOP_UNLOCK(vp); 6661 } 6662 6663 static void 6664 vfs_knl_assert_lock(void *arg, int what) 6665 { 6666 #ifdef INVARIANTS 6667 struct vnode *vp = arg; 6668 6669 if (what == LA_LOCKED) 6670 ASSERT_VOP_LOCKED(vp, "vfs_knl_assert_locked"); 6671 else 6672 ASSERT_VOP_UNLOCKED(vp, "vfs_knl_assert_unlocked"); 6673 #endif 6674 } 6675 6676 int 6677 vfs_kqfilter(struct vop_kqfilter_args *ap) 6678 { 6679 struct vnode *vp = ap->a_vp; 6680 struct knote *kn = ap->a_kn; 6681 struct knlist *knl; 6682 6683 KASSERT(vp->v_type != VFIFO || (kn->kn_filter != EVFILT_READ && 6684 kn->kn_filter != EVFILT_WRITE), 6685 ("READ/WRITE filter on a FIFO leaked through")); 6686 switch (kn->kn_filter) { 6687 case EVFILT_READ: 6688 kn->kn_fop = &vfsread_filtops; 6689 break; 6690 case EVFILT_WRITE: 6691 kn->kn_fop = &vfswrite_filtops; 6692 break; 6693 case EVFILT_VNODE: 6694 kn->kn_fop = &vfsvnode_filtops; 6695 break; 6696 default: 6697 return (EINVAL); 6698 } 6699 6700 kn->kn_hook = (caddr_t)vp; 6701 6702 v_addpollinfo(vp); 6703 if (vp->v_pollinfo == NULL) 6704 return (ENOMEM); 6705 knl = &vp->v_pollinfo->vpi_selinfo.si_note; 6706 vhold(vp); 6707 knlist_add(knl, kn, 0); 6708 6709 return (0); 6710 } 6711 6712 /* 6713 * Detach knote from vnode 6714 */ 6715 static void 6716 filt_vfsdetach(struct knote *kn) 6717 { 6718 struct vnode *vp = (struct vnode *)kn->kn_hook; 6719 6720 KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo")); 6721 knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0); 6722 vdrop(vp); 6723 } 6724 6725 /*ARGSUSED*/ 6726 static int 6727 filt_vfsread(struct knote *kn, long hint) 6728 { 6729 struct vnode *vp = (struct vnode *)kn->kn_hook; 6730 off_t size; 6731 int res; 6732 6733 /* 6734 * filesystem is gone, so set the EOF flag and schedule 6735 * the knote for deletion. 
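A hint of 0 together with a vnode that has already been reclaimed (VBAD) is treated the same as an explicit NOTE_REVOKE.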
6736 */ 6737 if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) { 6738 VI_LOCK(vp); 6739 kn->kn_flags |= (EV_EOF | EV_ONESHOT); 6740 VI_UNLOCK(vp); 6741 return (1); 6742 } 6743 6744 if (vn_getsize_locked(vp, &size, curthread->td_ucred) != 0) 6745 return (0); 6746 6747 VI_LOCK(vp); 6748 kn->kn_data = size - kn->kn_fp->f_offset; 6749 res = (kn->kn_sfflags & NOTE_FILE_POLL) != 0 || kn->kn_data != 0; 6750 VI_UNLOCK(vp); 6751 return (res); 6752 } 6753 6754 /*ARGSUSED*/ 6755 static int 6756 filt_vfswrite(struct knote *kn, long hint) 6757 { 6758 struct vnode *vp = (struct vnode *)kn->kn_hook; 6759 6760 VI_LOCK(vp); 6761 6762 /* 6763 * filesystem is gone, so set the EOF flag and schedule 6764 * the knote for deletion. 6765 */ 6766 if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) 6767 kn->kn_flags |= (EV_EOF | EV_ONESHOT); 6768 6769 kn->kn_data = 0; 6770 VI_UNLOCK(vp); 6771 return (1); 6772 } 6773 6774 static int 6775 filt_vfsvnode(struct knote *kn, long hint) 6776 { 6777 struct vnode *vp = (struct vnode *)kn->kn_hook; 6778 int res; 6779 6780 VI_LOCK(vp); 6781 if (kn->kn_sfflags & hint) 6782 kn->kn_fflags |= hint; 6783 if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) { 6784 kn->kn_flags |= EV_EOF; 6785 VI_UNLOCK(vp); 6786 return (1); 6787 } 6788 res = (kn->kn_fflags != 0); 6789 VI_UNLOCK(vp); 6790 return (res); 6791 } 6792 6793 static int 6794 filt_vfsdump(struct proc *p, struct knote *kn, struct kinfo_knote *kin) 6795 { 6796 struct vattr va; 6797 struct vnode *vp; 6798 char *fullpath, *freepath; 6799 int error; 6800 6801 kin->knt_extdata = KNOTE_EXTDATA_VNODE; 6802 6803 vp = kn->kn_fp->f_vnode; 6804 kin->knt_vnode.knt_vnode_type = vntype_to_kinfo(vp->v_type); 6805 6806 va.va_fsid = VNOVAL; 6807 vn_lock(vp, LK_SHARED | LK_RETRY); 6808 error = VOP_GETATTR(vp, &va, curthread->td_ucred); 6809 VOP_UNLOCK(vp); 6810 if (error != 0) 6811 return (error); 6812 kin->knt_vnode.knt_vnode_fsid = va.va_fsid; 6813 kin->knt_vnode.knt_vnode_fileid = va.va_fileid; 6814 6815 freepath = NULL; 6816 fullpath = "-"; 6817 error = vn_fullpath(vp, &fullpath, &freepath); 6818 if (error == 0) { 6819 strlcpy(kin->knt_vnode.knt_vnode_fullpath, fullpath, 6820 sizeof(kin->knt_vnode.knt_vnode_fullpath)); 6821 } 6822 if (freepath != NULL) 6823 free(freepath, M_TEMP); 6824 6825 return (0); 6826 } 6827 6828 int 6829 vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off) 6830 { 6831 int error; 6832 6833 if (dp->d_reclen > ap->a_uio->uio_resid) 6834 return (ENAMETOOLONG); 6835 error = uiomove(dp, dp->d_reclen, ap->a_uio); 6836 if (error) { 6837 if (ap->a_ncookies != NULL) { 6838 if (ap->a_cookies != NULL) 6839 free(ap->a_cookies, M_TEMP); 6840 ap->a_cookies = NULL; 6841 *ap->a_ncookies = 0; 6842 } 6843 return (error); 6844 } 6845 if (ap->a_ncookies == NULL) 6846 return (0); 6847 6848 KASSERT(ap->a_cookies, 6849 ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!")); 6850 6851 *ap->a_cookies = realloc(*ap->a_cookies, 6852 (*ap->a_ncookies + 1) * sizeof(uint64_t), M_TEMP, M_WAITOK | M_ZERO); 6853 (*ap->a_cookies)[*ap->a_ncookies] = off; 6854 *ap->a_ncookies += 1; 6855 return (0); 6856 } 6857 6858 /* 6859 * The purpose of this routine is to remove granularity from accmode_t, 6860 * reducing it into standard unix access bits - VEXEC, VREAD, VWRITE, 6861 * VADMIN and VAPPEND. 6862 * 6863 * If it returns 0, the caller is supposed to continue with the usual 6864 * access checks using 'accmode' as modified by this routine. 
If it 6865 * returns nonzero value, the caller is supposed to return that value 6866 * as errno. 6867 * 6868 * Note that after this routine runs, accmode may be zero. 6869 */ 6870 int 6871 vfs_unixify_accmode(accmode_t *accmode) 6872 { 6873 /* 6874 * There is no way to specify explicit "deny" rule using 6875 * file mode or POSIX.1e ACLs. 6876 */ 6877 if (*accmode & VEXPLICIT_DENY) { 6878 *accmode = 0; 6879 return (0); 6880 } 6881 6882 /* 6883 * None of these can be translated into usual access bits. 6884 * Also, the common case for NFSv4 ACLs is to not contain 6885 * either of these bits. Caller should check for VWRITE 6886 * on the containing directory instead. 6887 */ 6888 if (*accmode & (VDELETE_CHILD | VDELETE)) 6889 return (EPERM); 6890 6891 if (*accmode & VADMIN_PERMS) { 6892 *accmode &= ~VADMIN_PERMS; 6893 *accmode |= VADMIN; 6894 } 6895 6896 /* 6897 * There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL 6898 * or VSYNCHRONIZE using file mode or POSIX.1e ACL. 6899 */ 6900 *accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE); 6901 6902 return (0); 6903 } 6904 6905 /* 6906 * Clear out a doomed vnode (if any) and replace it with a new one as long 6907 * as the fs is not being unmounted. Return the root vnode to the caller. 6908 */ 6909 static int __noinline 6910 vfs_cache_root_fallback(struct mount *mp, int flags, struct vnode **vpp) 6911 { 6912 struct vnode *vp; 6913 int error; 6914 6915 restart: 6916 if (mp->mnt_rootvnode != NULL) { 6917 MNT_ILOCK(mp); 6918 vp = mp->mnt_rootvnode; 6919 if (vp != NULL) { 6920 if (!VN_IS_DOOMED(vp)) { 6921 vrefact(vp); 6922 MNT_IUNLOCK(mp); 6923 error = vn_lock(vp, flags); 6924 if (error == 0) { 6925 *vpp = vp; 6926 return (0); 6927 } 6928 vrele(vp); 6929 goto restart; 6930 } 6931 /* 6932 * Clear the old one. 6933 */ 6934 mp->mnt_rootvnode = NULL; 6935 } 6936 MNT_IUNLOCK(mp); 6937 if (vp != NULL) { 6938 vfs_op_barrier_wait(mp); 6939 vrele(vp); 6940 } 6941 } 6942 error = VFS_CACHEDROOT(mp, flags, vpp); 6943 if (error != 0) 6944 return (error); 6945 if (mp->mnt_vfs_ops == 0) { 6946 MNT_ILOCK(mp); 6947 if (mp->mnt_vfs_ops != 0) { 6948 MNT_IUNLOCK(mp); 6949 return (0); 6950 } 6951 if (mp->mnt_rootvnode == NULL) { 6952 vrefact(*vpp); 6953 mp->mnt_rootvnode = *vpp; 6954 } else { 6955 if (mp->mnt_rootvnode != *vpp) { 6956 if (!VN_IS_DOOMED(mp->mnt_rootvnode)) { 6957 panic("%s: mismatch between vnode returned " 6958 " by VFS_CACHEDROOT and the one cached " 6959 " (%p != %p)", 6960 __func__, *vpp, mp->mnt_rootvnode); 6961 } 6962 } 6963 } 6964 MNT_IUNLOCK(mp); 6965 } 6966 return (0); 6967 } 6968 6969 int 6970 vfs_cache_root(struct mount *mp, int flags, struct vnode **vpp) 6971 { 6972 struct mount_pcpu *mpcpu; 6973 struct vnode *vp; 6974 int error; 6975 6976 if (!vfs_op_thread_enter(mp, mpcpu)) 6977 return (vfs_cache_root_fallback(mp, flags, vpp)); 6978 vp = atomic_load_ptr(&mp->mnt_rootvnode); 6979 if (vp == NULL || VN_IS_DOOMED(vp)) { 6980 vfs_op_thread_exit(mp, mpcpu); 6981 return (vfs_cache_root_fallback(mp, flags, vpp)); 6982 } 6983 vrefact(vp); 6984 vfs_op_thread_exit(mp, mpcpu); 6985 error = vn_lock(vp, flags); 6986 if (error != 0) { 6987 vrele(vp); 6988 return (vfs_cache_root_fallback(mp, flags, vpp)); 6989 } 6990 *vpp = vp; 6991 return (0); 6992 } 6993 6994 struct vnode * 6995 vfs_cache_root_clear(struct mount *mp) 6996 { 6997 struct vnode *vp; 6998 6999 /* 7000 * ops > 0 guarantees there is nobody who can see this vnode 7001 */ 7002 MPASS(mp->mnt_vfs_ops > 0); 7003 vp = mp->mnt_rootvnode; 7004 if (vp != NULL) 7005 vn_seqc_write_begin(vp); 7006 mp->mnt_rootvnode = 
NULL; 7007 return (vp); 7008 } 7009 7010 void 7011 vfs_cache_root_set(struct mount *mp, struct vnode *vp) 7012 { 7013 7014 MPASS(mp->mnt_vfs_ops > 0); 7015 vrefact(vp); 7016 mp->mnt_rootvnode = vp; 7017 } 7018 7019 /* 7020 * These are helper functions for filesystems to traverse all 7021 * their vnodes. See MNT_VNODE_FOREACH_ALL() in sys/mount.h. 7022 * 7023 * This interface replaces MNT_VNODE_FOREACH. 7024 */ 7025 7026 struct vnode * 7027 __mnt_vnode_next_all(struct vnode **mvp, struct mount *mp) 7028 { 7029 struct vnode *vp; 7030 7031 maybe_yield(); 7032 MNT_ILOCK(mp); 7033 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); 7034 for (vp = TAILQ_NEXT(*mvp, v_nmntvnodes); vp != NULL; 7035 vp = TAILQ_NEXT(vp, v_nmntvnodes)) { 7036 /* Allow a racy peek at VIRF_DOOMED to save a lock acquisition. */ 7037 if (vp->v_type == VMARKER || VN_IS_DOOMED(vp)) 7038 continue; 7039 VI_LOCK(vp); 7040 if (VN_IS_DOOMED(vp)) { 7041 VI_UNLOCK(vp); 7042 continue; 7043 } 7044 break; 7045 } 7046 if (vp == NULL) { 7047 __mnt_vnode_markerfree_all(mvp, mp); 7048 /* MNT_IUNLOCK(mp); -- done in above function */ 7049 mtx_assert(MNT_MTX(mp), MA_NOTOWNED); 7050 return (NULL); 7051 } 7052 TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); 7053 TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); 7054 MNT_IUNLOCK(mp); 7055 return (vp); 7056 } 7057 7058 struct vnode * 7059 __mnt_vnode_first_all(struct vnode **mvp, struct mount *mp) 7060 { 7061 struct vnode *vp; 7062 7063 *mvp = vn_alloc_marker(mp); 7064 MNT_ILOCK(mp); 7065 MNT_REF(mp); 7066 7067 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 7068 /* Allow a racy peek at VIRF_DOOMED to save a lock acquisition. */ 7069 if (vp->v_type == VMARKER || VN_IS_DOOMED(vp)) 7070 continue; 7071 VI_LOCK(vp); 7072 if (VN_IS_DOOMED(vp)) { 7073 VI_UNLOCK(vp); 7074 continue; 7075 } 7076 break; 7077 } 7078 if (vp == NULL) { 7079 MNT_REL(mp); 7080 MNT_IUNLOCK(mp); 7081 vn_free_marker(*mvp); 7082 *mvp = NULL; 7083 return (NULL); 7084 } 7085 TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); 7086 MNT_IUNLOCK(mp); 7087 return (vp); 7088 } 7089 7090 void 7091 __mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp) 7092 { 7093 7094 if (*mvp == NULL) { 7095 MNT_IUNLOCK(mp); 7096 return; 7097 } 7098 7099 mtx_assert(MNT_MTX(mp), MA_OWNED); 7100 7101 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); 7102 TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); 7103 MNT_REL(mp); 7104 MNT_IUNLOCK(mp); 7105 vn_free_marker(*mvp); 7106 *mvp = NULL; 7107 } 7108 7109 /* 7110 * These are helper functions for filesystems to traverse their 7111 * lazy vnodes. See MNT_VNODE_FOREACH_LAZY() in sys/mount.h 7112 */ 7113 static void 7114 mnt_vnode_markerfree_lazy(struct vnode **mvp, struct mount *mp) 7115 { 7116 7117 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); 7118 7119 MNT_ILOCK(mp); 7120 MNT_REL(mp); 7121 MNT_IUNLOCK(mp); 7122 vn_free_marker(*mvp); 7123 *mvp = NULL; 7124 } 7125 7126 /* 7127 * Relock the mp mount vnode list lock with the vp vnode interlock in the 7128 * conventional lock order during mnt_vnode_next_lazy iteration. 7129 * 7130 * On entry, the mount vnode list lock is held and the vnode interlock is not. 7131 * The list lock is dropped and reacquired. On success, both locks are held. 7132 * On failure, the mount vnode list lock is held but the vnode interlock is 7133 * not, and the procedure may have yielded. 
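A transient hold is taken on the vnode so that it cannot be freed while the list lock is dropped, and the marker is requeued in front of the vnode first so that the caller can resume the iteration from the same point if the relock fails.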
7134 */ 7135 static bool 7136 mnt_vnode_next_lazy_relock(struct vnode *mvp, struct mount *mp, 7137 struct vnode *vp) 7138 { 7139 7140 VNASSERT(mvp->v_mount == mp && mvp->v_type == VMARKER && 7141 TAILQ_NEXT(mvp, v_lazylist) != NULL, mvp, 7142 ("%s: bad marker", __func__)); 7143 VNASSERT(vp->v_mount == mp && vp->v_type != VMARKER, vp, 7144 ("%s: inappropriate vnode", __func__)); 7145 ASSERT_VI_UNLOCKED(vp, __func__); 7146 mtx_assert(&mp->mnt_listmtx, MA_OWNED); 7147 7148 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, mvp, v_lazylist); 7149 TAILQ_INSERT_BEFORE(vp, mvp, v_lazylist); 7150 7151 /* 7152 * Note we may be racing against vdrop which transitioned the hold 7153 * count to 0 and now waits for the ->mnt_listmtx lock. This is fine, 7154 * if we are the only user after we get the interlock we will just 7155 * vdrop. 7156 */ 7157 vhold(vp); 7158 mtx_unlock(&mp->mnt_listmtx); 7159 VI_LOCK(vp); 7160 if (VN_IS_DOOMED(vp)) { 7161 VNPASS((vp->v_mflag & VMP_LAZYLIST) == 0, vp); 7162 goto out_lost; 7163 } 7164 VNPASS(vp->v_mflag & VMP_LAZYLIST, vp); 7165 /* 7166 * There is nothing to do if we are the last user. 7167 */ 7168 if (!refcount_release_if_not_last(&vp->v_holdcnt)) 7169 goto out_lost; 7170 mtx_lock(&mp->mnt_listmtx); 7171 return (true); 7172 out_lost: 7173 vdropl(vp); 7174 maybe_yield(); 7175 mtx_lock(&mp->mnt_listmtx); 7176 return (false); 7177 } 7178 7179 static struct vnode * 7180 mnt_vnode_next_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb, 7181 void *cbarg) 7182 { 7183 struct vnode *vp; 7184 7185 mtx_assert(&mp->mnt_listmtx, MA_OWNED); 7186 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); 7187 restart: 7188 vp = TAILQ_NEXT(*mvp, v_lazylist); 7189 while (vp != NULL) { 7190 if (vp->v_type == VMARKER) { 7191 vp = TAILQ_NEXT(vp, v_lazylist); 7192 continue; 7193 } 7194 /* 7195 * See if we want to process the vnode. Note we may encounter a 7196 * long string of vnodes we don't care about and hog the list 7197 * as a result. Check for it and requeue the marker. 7198 */ 7199 VNPASS(!VN_IS_DOOMED(vp), vp); 7200 if (!cb(vp, cbarg)) { 7201 if (!should_yield()) { 7202 vp = TAILQ_NEXT(vp, v_lazylist); 7203 continue; 7204 } 7205 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, 7206 v_lazylist); 7207 TAILQ_INSERT_AFTER(&mp->mnt_lazyvnodelist, vp, *mvp, 7208 v_lazylist); 7209 mtx_unlock(&mp->mnt_listmtx); 7210 kern_yield(PRI_USER); 7211 mtx_lock(&mp->mnt_listmtx); 7212 goto restart; 7213 } 7214 /* 7215 * Try-lock because this is the wrong lock order. 
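If the try-lock fails, mnt_vnode_next_lazy_relock() drops the list lock and acquires both locks in the proper order; if that also fails, the iteration restarts from the marker because the list may have changed while the lock was dropped.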
7216 */ 7217 if (!VI_TRYLOCK(vp) && 7218 !mnt_vnode_next_lazy_relock(*mvp, mp, vp)) 7219 goto restart; 7220 KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp)); 7221 KASSERT(vp->v_mount == mp || vp->v_mount == NULL, 7222 ("alien vnode on the lazy list %p %p", vp, mp)); 7223 VNPASS(vp->v_mount == mp, vp); 7224 VNPASS(!VN_IS_DOOMED(vp), vp); 7225 break; 7226 } 7227 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, v_lazylist); 7228 7229 /* Check if we are done */ 7230 if (vp == NULL) { 7231 mtx_unlock(&mp->mnt_listmtx); 7232 mnt_vnode_markerfree_lazy(mvp, mp); 7233 return (NULL); 7234 } 7235 TAILQ_INSERT_AFTER(&mp->mnt_lazyvnodelist, vp, *mvp, v_lazylist); 7236 mtx_unlock(&mp->mnt_listmtx); 7237 ASSERT_VI_LOCKED(vp, "lazy iter"); 7238 return (vp); 7239 } 7240 7241 struct vnode * 7242 __mnt_vnode_next_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb, 7243 void *cbarg) 7244 { 7245 7246 maybe_yield(); 7247 mtx_lock(&mp->mnt_listmtx); 7248 return (mnt_vnode_next_lazy(mvp, mp, cb, cbarg)); 7249 } 7250 7251 struct vnode * 7252 __mnt_vnode_first_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb, 7253 void *cbarg) 7254 { 7255 struct vnode *vp; 7256 7257 if (TAILQ_EMPTY(&mp->mnt_lazyvnodelist)) 7258 return (NULL); 7259 7260 *mvp = vn_alloc_marker(mp); 7261 MNT_ILOCK(mp); 7262 MNT_REF(mp); 7263 MNT_IUNLOCK(mp); 7264 7265 mtx_lock(&mp->mnt_listmtx); 7266 vp = TAILQ_FIRST(&mp->mnt_lazyvnodelist); 7267 if (vp == NULL) { 7268 mtx_unlock(&mp->mnt_listmtx); 7269 mnt_vnode_markerfree_lazy(mvp, mp); 7270 return (NULL); 7271 } 7272 TAILQ_INSERT_BEFORE(vp, *mvp, v_lazylist); 7273 return (mnt_vnode_next_lazy(mvp, mp, cb, cbarg)); 7274 } 7275 7276 void 7277 __mnt_vnode_markerfree_lazy(struct vnode **mvp, struct mount *mp) 7278 { 7279 7280 if (*mvp == NULL) 7281 return; 7282 7283 mtx_lock(&mp->mnt_listmtx); 7284 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, v_lazylist); 7285 mtx_unlock(&mp->mnt_listmtx); 7286 mnt_vnode_markerfree_lazy(mvp, mp); 7287 } 7288 7289 int 7290 vn_dir_check_exec(struct vnode *vp, struct componentname *cnp) 7291 { 7292 7293 if ((cnp->cn_flags & NOEXECCHECK) != 0) { 7294 cnp->cn_flags &= ~NOEXECCHECK; 7295 return (0); 7296 } 7297 7298 return (VOP_ACCESS(vp, VEXEC, cnp->cn_cred, curthread)); 7299 } 7300 7301 /* 7302 * Do not use this variant unless you have means other than the hold count 7303 * to prevent the vnode from getting freed. 7304 */ 7305 void 7306 vn_seqc_write_begin_locked(struct vnode *vp) 7307 { 7308 7309 ASSERT_VI_LOCKED(vp, __func__); 7310 VNPASS(vp->v_holdcnt > 0, vp); 7311 VNPASS(vp->v_seqc_users >= 0, vp); 7312 vp->v_seqc_users++; 7313 if (vp->v_seqc_users == 1) 7314 seqc_sleepable_write_begin(&vp->v_seqc); 7315 } 7316 7317 void 7318 vn_seqc_write_begin(struct vnode *vp) 7319 { 7320 7321 VI_LOCK(vp); 7322 vn_seqc_write_begin_locked(vp); 7323 VI_UNLOCK(vp); 7324 } 7325 7326 void 7327 vn_seqc_write_end_locked(struct vnode *vp) 7328 { 7329 7330 ASSERT_VI_LOCKED(vp, __func__); 7331 VNPASS(vp->v_seqc_users > 0, vp); 7332 vp->v_seqc_users--; 7333 if (vp->v_seqc_users == 0) 7334 seqc_sleepable_write_end(&vp->v_seqc); 7335 } 7336 7337 void 7338 vn_seqc_write_end(struct vnode *vp) 7339 { 7340 7341 VI_LOCK(vp); 7342 vn_seqc_write_end_locked(vp); 7343 VI_UNLOCK(vp); 7344 } 7345 7346 /* 7347 * Special case handling for allocating and freeing vnodes. 7348 * 7349 * The counter remains unchanged on free so that a doomed vnode will 7350 * keep testing as in modify as long as it is accessible with SMR. 
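Consequently vn_seqc_init() resets both the counter and the user count when a vnode is allocated, while vn_seqc_write_end_free() merely asserts that the final write section is still open instead of ending it.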
7351 */ 7352 static void 7353 vn_seqc_init(struct vnode *vp) 7354 { 7355 7356 vp->v_seqc = 0; 7357 vp->v_seqc_users = 0; 7358 } 7359 7360 static void 7361 vn_seqc_write_end_free(struct vnode *vp) 7362 { 7363 7364 VNPASS(seqc_in_modify(vp->v_seqc), vp); 7365 VNPASS(vp->v_seqc_users == 1, vp); 7366 } 7367 7368 void 7369 vn_irflag_set_locked(struct vnode *vp, short toset) 7370 { 7371 short flags; 7372 7373 ASSERT_VI_LOCKED(vp, __func__); 7374 flags = vn_irflag_read(vp); 7375 VNASSERT((flags & toset) == 0, vp, 7376 ("%s: some of the passed flags already set (have %d, passed %d)\n", 7377 __func__, flags, toset)); 7378 atomic_store_short(&vp->v_irflag, flags | toset); 7379 } 7380 7381 void 7382 vn_irflag_set(struct vnode *vp, short toset) 7383 { 7384 7385 VI_LOCK(vp); 7386 vn_irflag_set_locked(vp, toset); 7387 VI_UNLOCK(vp); 7388 } 7389 7390 void 7391 vn_irflag_set_cond_locked(struct vnode *vp, short toset) 7392 { 7393 short flags; 7394 7395 ASSERT_VI_LOCKED(vp, __func__); 7396 flags = vn_irflag_read(vp); 7397 atomic_store_short(&vp->v_irflag, flags | toset); 7398 } 7399 7400 void 7401 vn_irflag_set_cond(struct vnode *vp, short toset) 7402 { 7403 7404 VI_LOCK(vp); 7405 vn_irflag_set_cond_locked(vp, toset); 7406 VI_UNLOCK(vp); 7407 } 7408 7409 void 7410 vn_irflag_unset_locked(struct vnode *vp, short tounset) 7411 { 7412 short flags; 7413 7414 ASSERT_VI_LOCKED(vp, __func__); 7415 flags = vn_irflag_read(vp); 7416 VNASSERT((flags & tounset) == tounset, vp, 7417 ("%s: some of the passed flags not set (have %d, passed %d)\n", 7418 __func__, flags, tounset)); 7419 atomic_store_short(&vp->v_irflag, flags & ~tounset); 7420 } 7421 7422 void 7423 vn_irflag_unset(struct vnode *vp, short tounset) 7424 { 7425 7426 VI_LOCK(vp); 7427 vn_irflag_unset_locked(vp, tounset); 7428 VI_UNLOCK(vp); 7429 } 7430 7431 int 7432 vn_getsize_locked(struct vnode *vp, off_t *size, struct ucred *cred) 7433 { 7434 struct vattr vattr; 7435 int error; 7436 7437 ASSERT_VOP_LOCKED(vp, __func__); 7438 error = VOP_GETATTR(vp, &vattr, cred); 7439 if (__predict_true(error == 0)) { 7440 if (vattr.va_size <= OFF_MAX) 7441 *size = vattr.va_size; 7442 else 7443 error = EFBIG; 7444 } 7445 return (error); 7446 } 7447 7448 int 7449 vn_getsize(struct vnode *vp, off_t *size, struct ucred *cred) 7450 { 7451 int error; 7452 7453 VOP_LOCK(vp, LK_SHARED); 7454 error = vn_getsize_locked(vp, size, cred); 7455 VOP_UNLOCK(vp); 7456 return (error); 7457 } 7458 7459 #ifdef INVARIANTS 7460 void 7461 vn_set_state_validate(struct vnode *vp, __enum_uint8(vstate) state) 7462 { 7463 7464 switch (vp->v_state) { 7465 case VSTATE_UNINITIALIZED: 7466 switch (state) { 7467 case VSTATE_CONSTRUCTED: 7468 case VSTATE_DESTROYING: 7469 return; 7470 default: 7471 break; 7472 } 7473 break; 7474 case VSTATE_CONSTRUCTED: 7475 ASSERT_VOP_ELOCKED(vp, __func__); 7476 switch (state) { 7477 case VSTATE_DESTROYING: 7478 return; 7479 default: 7480 break; 7481 } 7482 break; 7483 case VSTATE_DESTROYING: 7484 ASSERT_VOP_ELOCKED(vp, __func__); 7485 switch (state) { 7486 case VSTATE_DEAD: 7487 return; 7488 default: 7489 break; 7490 } 7491 break; 7492 case VSTATE_DEAD: 7493 switch (state) { 7494 case VSTATE_UNINITIALIZED: 7495 return; 7496 default: 7497 break; 7498 } 7499 break; 7500 } 7501 7502 vn_printf(vp, "invalid state transition %d -> %d\n", vp->v_state, state); 7503 panic("invalid state transition %d -> %d\n", vp->v_state, state); 7504 } 7505 #endif 7506