1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1989, 1993 5 * The Regents of the University of California. All rights reserved. 6 * (c) UNIX System Laboratories, Inc. 7 * All or some portions of this file are derived from material licensed 8 * to the University of California by American Telephone and Telegraph 9 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 10 * the permission of UNIX System Laboratories, Inc. 11 * 12 * Redistribution and use in source and binary forms, with or without 13 * modification, are permitted provided that the following conditions 14 * are met: 15 * 1. Redistributions of source code must retain the above copyright 16 * notice, this list of conditions and the following disclaimer. 17 * 2. Redistributions in binary form must reproduce the above copyright 18 * notice, this list of conditions and the following disclaimer in the 19 * documentation and/or other materials provided with the distribution. 20 * 3. Neither the name of the University nor the names of its contributors 21 * may be used to endorse or promote products derived from this software 22 * without specific prior written permission. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 34 * SUCH DAMAGE. 
35 */ 36 37 /* 38 * External virtual filesystem routines 39 */ 40 41 #include "opt_ddb.h" 42 #include "opt_watchdog.h" 43 44 #include <sys/param.h> 45 #include <sys/systm.h> 46 #include <sys/asan.h> 47 #include <sys/bio.h> 48 #include <sys/buf.h> 49 #include <sys/capsicum.h> 50 #include <sys/condvar.h> 51 #include <sys/conf.h> 52 #include <sys/counter.h> 53 #include <sys/dirent.h> 54 #include <sys/event.h> 55 #include <sys/eventhandler.h> 56 #include <sys/extattr.h> 57 #include <sys/file.h> 58 #include <sys/fcntl.h> 59 #include <sys/inotify.h> 60 #include <sys/jail.h> 61 #include <sys/kdb.h> 62 #include <sys/kernel.h> 63 #include <sys/kthread.h> 64 #include <sys/ktr.h> 65 #include <sys/limits.h> 66 #include <sys/lockf.h> 67 #include <sys/malloc.h> 68 #include <sys/mount.h> 69 #include <sys/namei.h> 70 #include <sys/pctrie.h> 71 #include <sys/priv.h> 72 #include <sys/reboot.h> 73 #include <sys/refcount.h> 74 #include <sys/rwlock.h> 75 #include <sys/sched.h> 76 #include <sys/sleepqueue.h> 77 #include <sys/smr.h> 78 #include <sys/smp.h> 79 #include <sys/stat.h> 80 #include <sys/stdarg.h> 81 #include <sys/sysctl.h> 82 #include <sys/syslog.h> 83 #include <sys/user.h> 84 #include <sys/vmmeter.h> 85 #include <sys/vnode.h> 86 #include <sys/watchdog.h> 87 88 #include <security/mac/mac_framework.h> 89 90 #include <vm/vm.h> 91 #include <vm/vm_object.h> 92 #include <vm/vm_extern.h> 93 #include <vm/pmap.h> 94 #include <vm/vm_map.h> 95 #include <vm/vm_page.h> 96 #include <vm/vm_kern.h> 97 #include <vm/vnode_pager.h> 98 #include <vm/uma.h> 99 100 #if defined(DEBUG_VFS_LOCKS) && (!defined(INVARIANTS) || !defined(WITNESS)) 101 #error DEBUG_VFS_LOCKS requires INVARIANTS and WITNESS 102 #endif 103 104 #ifdef DDB 105 #include <ddb/ddb.h> 106 #endif 107 108 static void delmntque(struct vnode *vp); 109 static int flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, 110 int slpflag, int slptimeo); 111 static void syncer_shutdown(void *arg, int howto); 112 static int vtryrecycle(struct vnode *vp, bool isvnlru); 113 static void v_init_counters(struct vnode *); 114 static void vn_seqc_init(struct vnode *); 115 static void vn_seqc_write_end_free(struct vnode *vp); 116 static void vgonel(struct vnode *); 117 static bool vhold_recycle_free(struct vnode *); 118 static void vdropl_recycle(struct vnode *vp); 119 static void vdrop_recycle(struct vnode *vp); 120 static void vfs_knllock(void *arg); 121 static void vfs_knlunlock(void *arg); 122 static void vfs_knl_assert_lock(void *arg, int what); 123 static void destroy_vpollinfo(struct vpollinfo *vi); 124 static int v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo, 125 daddr_t startlbn, daddr_t endlbn); 126 static void vnlru_recalc(void); 127 128 static SYSCTL_NODE(_vfs, OID_AUTO, vnode, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 129 "vnode configuration and statistics"); 130 static SYSCTL_NODE(_vfs_vnode, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 131 "vnode configuration"); 132 static SYSCTL_NODE(_vfs_vnode, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 133 "vnode statistics"); 134 static SYSCTL_NODE(_vfs_vnode, OID_AUTO, vnlru, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 135 "vnode recycling"); 136 137 /* 138 * Number of vnodes in existence. Increased whenever getnewvnode() 139 * allocates a new vnode, decreased in vdropl() for VIRF_DOOMED vnode. 
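 *
 * For illustration, the counter can be read from userland through the
 * sysctls declared right below; a minimal sketch using sysctlbyname(3):
 *
 *	u_long count;
 *	size_t len = sizeof(count);
 *	sysctlbyname("vfs.vnode.stats.count", &count, &len, NULL, 0);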
 */
static u_long __exclusive_cache_line numvnodes;

SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0,
    "Number of vnodes in existence (legacy)");
SYSCTL_ULONG(_vfs_vnode_stats, OID_AUTO, count, CTLFLAG_RD, &numvnodes, 0,
    "Number of vnodes in existence");

static counter_u64_t vnodes_created;
SYSCTL_COUNTER_U64(_vfs, OID_AUTO, vnodes_created, CTLFLAG_RD, &vnodes_created,
    "Number of vnodes created by getnewvnode (legacy)");
SYSCTL_COUNTER_U64(_vfs_vnode_stats, OID_AUTO, created, CTLFLAG_RD, &vnodes_created,
    "Number of vnodes created by getnewvnode");

/*
 * Conversion tables for conversion from vnode types to inode formats
 * and back.
 */
__enum_uint8(vtype) iftovt_tab[16] = {
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
};
int vttoif_tab[10] = {
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
	S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT
};

/*
 * List of allocated vnodes in the system.
 */
static TAILQ_HEAD(freelst, vnode) vnode_list;
static struct vnode *vnode_list_free_marker;
static struct vnode *vnode_list_reclaim_marker;

/*
 * "Free" vnode target. Free vnodes are rarely completely free, but are
 * just ones that are cheap to recycle. Usually they are for files which
 * have been stat'd but not read; these usually have inode and namecache
 * data attached to them. This target is the preferred minimum size of a
 * sub-cache consisting mostly of such files. The system balances the size
 * of this sub-cache with its complement to try to prevent either from
 * thrashing while the other is relatively inactive. The targets express
 * a preference for the best balance.
 *
 * "Above" this target there are 2 further targets (watermarks) related
 * to recycling of free vnodes. In the best-operating case, the cache is
 * exactly full, the free list has size between vlowat and vhiwat above the
 * free target, and recycling from it and normal use maintains this state.
 * Sometimes the free list is below vlowat or even empty, but this state
 * is even better for immediate use provided the cache is not full.
 * Otherwise, vnlru_proc() runs to reclaim enough vnodes (usually non-free
 * ones) to reach one of these states. The watermarks are currently hard-
 * coded as 4% and 9% of the available space higher. These and the default
 * of 25% for wantfreevnodes are too large if the memory size is large.
 * E.g., 9% of 75% of MAXVNODES is more than 566000 vnodes to reclaim
 * whenever vnlru_proc() becomes active.
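 *
 * As a worked example of the figure above: with desiredvnodes at the
 * MAXVNODES_MAX cap of 8388608 and wantfreevnodes at its default 25%,
 * the remaining space is 0.75 * 8388608 = 6291456 vnodes, and 9% of that
 * is about 566231, hence "more than 566000". The watermarks themselves
 * are computed in vnlru_recalc() (vhiwat = gapvnodes / 11, vlowat =
 * vhiwat / 2).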
196 */ 197 static long wantfreevnodes; 198 static long __exclusive_cache_line freevnodes; 199 static long freevnodes_old; 200 201 static u_long recycles_count; 202 SYSCTL_ULONG(_vfs, OID_AUTO, recycles, CTLFLAG_RD | CTLFLAG_STATS, &recycles_count, 0, 203 "Number of vnodes recycled to meet vnode cache targets (legacy)"); 204 SYSCTL_ULONG(_vfs_vnode_vnlru, OID_AUTO, recycles, CTLFLAG_RD | CTLFLAG_STATS, 205 &recycles_count, 0, 206 "Number of vnodes recycled to meet vnode cache targets"); 207 208 static u_long recycles_free_count; 209 SYSCTL_ULONG(_vfs, OID_AUTO, recycles_free, CTLFLAG_RD | CTLFLAG_STATS, 210 &recycles_free_count, 0, 211 "Number of free vnodes recycled to meet vnode cache targets (legacy)"); 212 SYSCTL_ULONG(_vfs_vnode_vnlru, OID_AUTO, recycles_free, CTLFLAG_RD | CTLFLAG_STATS, 213 &recycles_free_count, 0, 214 "Number of free vnodes recycled to meet vnode cache targets"); 215 216 static counter_u64_t direct_recycles_free_count; 217 SYSCTL_COUNTER_U64(_vfs_vnode_vnlru, OID_AUTO, direct_recycles_free, CTLFLAG_RD, 218 &direct_recycles_free_count, 219 "Number of free vnodes recycled by vn_alloc callers to meet vnode cache targets"); 220 221 static counter_u64_t vnode_skipped_requeues; 222 SYSCTL_COUNTER_U64(_vfs_vnode_stats, OID_AUTO, skipped_requeues, CTLFLAG_RD, &vnode_skipped_requeues, 223 "Number of times LRU requeue was skipped due to lock contention"); 224 225 static __read_mostly bool vnode_can_skip_requeue; 226 SYSCTL_BOOL(_vfs_vnode_param, OID_AUTO, can_skip_requeue, CTLFLAG_RW, 227 &vnode_can_skip_requeue, 0, "Is LRU requeue skippable"); 228 229 static u_long deferred_inact; 230 SYSCTL_ULONG(_vfs, OID_AUTO, deferred_inact, CTLFLAG_RD, 231 &deferred_inact, 0, "Number of times inactive processing was deferred"); 232 233 /* To keep more than one thread at a time from running vfs_getnewfsid */ 234 static struct mtx mntid_mtx; 235 236 /* 237 * Lock for any access to the following: 238 * vnode_list 239 * numvnodes 240 * freevnodes 241 */ 242 static struct mtx __exclusive_cache_line vnode_list_mtx; 243 244 /* Publicly exported FS */ 245 struct nfs_public nfs_pub; 246 247 static uma_zone_t buf_trie_zone; 248 static smr_t buf_trie_smr; 249 250 /* Zone for allocation of new vnodes - used exclusively by getnewvnode() */ 251 static uma_zone_t vnode_zone; 252 MALLOC_DEFINE(M_VNODEPOLL, "VN POLL", "vnode poll"); 253 254 __read_frequently smr_t vfs_smr; 255 256 /* 257 * The workitem queue. 258 * 259 * It is useful to delay writes of file data and filesystem metadata 260 * for tens of seconds so that quickly created and deleted files need 261 * not waste disk bandwidth being created and removed. To realize this, 262 * we append vnodes to a "workitem" queue. When running with a soft 263 * updates implementation, most pending metadata dependencies should 264 * not wait for more than a few seconds. Thus, mounted on block devices 265 * are delayed only about a half the time that file data is delayed. 266 * Similarly, directory updates are more critical, so are only delayed 267 * about a third the time that file data is delayed. Thus, there are 268 * SYNCER_MAXDELAY queues that are processed round-robin at a rate of 269 * one each second (driven off the filesystem syncer process). The 270 * syncer_delayno variable indicates the next queue that is to be processed. 
271 * Items that need to be processed soon are placed in this queue: 272 * 273 * syncer_workitem_pending[syncer_delayno] 274 * 275 * A delay of fifteen seconds is done by placing the request fifteen 276 * entries later in the queue: 277 * 278 * syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask] 279 * 280 */ 281 static int syncer_delayno; 282 static long syncer_mask; 283 LIST_HEAD(synclist, bufobj); 284 static struct synclist *syncer_workitem_pending; 285 /* 286 * The sync_mtx protects: 287 * bo->bo_synclist 288 * sync_vnode_count 289 * syncer_delayno 290 * syncer_state 291 * syncer_workitem_pending 292 * syncer_worklist_len 293 * rushjob 294 */ 295 static struct mtx sync_mtx; 296 static struct cv sync_wakeup; 297 298 #define SYNCER_MAXDELAY 32 299 static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */ 300 static int syncdelay = 30; /* max time to delay syncing data */ 301 static int filedelay = 30; /* time to delay syncing files */ 302 SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, 303 "Time to delay syncing files (in seconds)"); 304 static int dirdelay = 29; /* time to delay syncing directories */ 305 SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, 306 "Time to delay syncing directories (in seconds)"); 307 static int metadelay = 28; /* time to delay syncing metadata */ 308 SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, 309 "Time to delay syncing metadata (in seconds)"); 310 static int rushjob; /* number of slots to run ASAP */ 311 static int stat_rush_requests; /* number of times I/O speeded up */ 312 SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, 313 "Number of times I/O speeded up (rush requests)"); 314 315 #define VDBATCH_SIZE 8 316 struct vdbatch { 317 u_int index; 318 struct mtx lock; 319 struct vnode *tab[VDBATCH_SIZE]; 320 }; 321 DPCPU_DEFINE_STATIC(struct vdbatch, vd); 322 323 static void vdbatch_dequeue(struct vnode *vp); 324 325 /* 326 * The syncer will require at least SYNCER_MAXDELAY iterations to shutdown; 327 * we probably don't want to pause for the whole second each time. 328 */ 329 #define SYNCER_SHUTDOWN_SPEEDUP 32 330 static int sync_vnode_count; 331 static int syncer_worklist_len; 332 static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY } 333 syncer_state; 334 335 /* Target for maximum number of vnodes. */ 336 u_long desiredvnodes; 337 static u_long gapvnodes; /* gap between wanted and desired */ 338 static u_long vhiwat; /* enough extras after expansion */ 339 static u_long vlowat; /* minimal extras before expansion */ 340 static bool vstir; /* nonzero to stir non-free vnodes */ 341 static volatile int vsmalltrigger = 8; /* pref to keep if > this many pages */ 342 343 static u_long vnlru_read_freevnodes(void); 344 345 /* 346 * Note that no attempt is made to sanitize these parameters. 347 */ 348 static int 349 sysctl_maxvnodes(SYSCTL_HANDLER_ARGS) 350 { 351 u_long val; 352 int error; 353 354 val = desiredvnodes; 355 error = sysctl_handle_long(oidp, &val, 0, req); 356 if (error != 0 || req->newptr == NULL) 357 return (error); 358 359 if (val == desiredvnodes) 360 return (0); 361 mtx_lock(&vnode_list_mtx); 362 desiredvnodes = val; 363 wantfreevnodes = desiredvnodes / 4; 364 vnlru_recalc(); 365 mtx_unlock(&vnode_list_mtx); 366 /* 367 * XXX There is no protection against multiple threads changing 368 * desiredvnodes at the same time. Locking above only helps vnlru and 369 * getnewvnode. 
370 */ 371 vfs_hash_changesize(desiredvnodes); 372 cache_changesize(desiredvnodes); 373 return (0); 374 } 375 376 SYSCTL_PROC(_kern, KERN_MAXVNODES, maxvnodes, 377 CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_maxvnodes, 378 "LU", "Target for maximum number of vnodes (legacy)"); 379 SYSCTL_PROC(_vfs_vnode_param, OID_AUTO, limit, 380 CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_maxvnodes, 381 "LU", "Target for maximum number of vnodes"); 382 383 static int 384 sysctl_freevnodes(SYSCTL_HANDLER_ARGS) 385 { 386 u_long rfreevnodes; 387 388 rfreevnodes = vnlru_read_freevnodes(); 389 return (sysctl_handle_long(oidp, &rfreevnodes, 0, req)); 390 } 391 392 SYSCTL_PROC(_vfs, OID_AUTO, freevnodes, 393 CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RD, NULL, 0, sysctl_freevnodes, 394 "LU", "Number of \"free\" vnodes (legacy)"); 395 SYSCTL_PROC(_vfs_vnode_stats, OID_AUTO, free, 396 CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RD, NULL, 0, sysctl_freevnodes, 397 "LU", "Number of \"free\" vnodes"); 398 399 static int 400 sysctl_wantfreevnodes(SYSCTL_HANDLER_ARGS) 401 { 402 u_long val; 403 int error; 404 405 val = wantfreevnodes; 406 error = sysctl_handle_long(oidp, &val, 0, req); 407 if (error != 0 || req->newptr == NULL) 408 return (error); 409 410 if (val == wantfreevnodes) 411 return (0); 412 mtx_lock(&vnode_list_mtx); 413 wantfreevnodes = val; 414 vnlru_recalc(); 415 mtx_unlock(&vnode_list_mtx); 416 return (0); 417 } 418 419 SYSCTL_PROC(_vfs, OID_AUTO, wantfreevnodes, 420 CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_wantfreevnodes, 421 "LU", "Target for minimum number of \"free\" vnodes (legacy)"); 422 SYSCTL_PROC(_vfs_vnode_param, OID_AUTO, wantfree, 423 CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_wantfreevnodes, 424 "LU", "Target for minimum number of \"free\" vnodes"); 425 426 static int vnlru_nowhere; 427 SYSCTL_INT(_vfs_vnode_vnlru, OID_AUTO, failed_runs, CTLFLAG_RD | CTLFLAG_STATS, 428 &vnlru_nowhere, 0, "Number of times the vnlru process ran without success"); 429 430 static int 431 sysctl_try_reclaim_vnode(SYSCTL_HANDLER_ARGS) 432 { 433 struct vnode *vp; 434 struct nameidata nd; 435 char *buf; 436 unsigned long ndflags; 437 int error; 438 439 if (req->newptr == NULL) 440 return (EINVAL); 441 if (req->newlen >= PATH_MAX) 442 return (E2BIG); 443 444 buf = malloc(PATH_MAX, M_TEMP, M_WAITOK); 445 error = SYSCTL_IN(req, buf, req->newlen); 446 if (error != 0) 447 goto out; 448 449 buf[req->newlen] = '\0'; 450 451 ndflags = LOCKLEAF | NOFOLLOW | AUDITVNODE1; 452 NDINIT(&nd, LOOKUP, ndflags, UIO_SYSSPACE, buf); 453 if ((error = namei(&nd)) != 0) 454 goto out; 455 vp = nd.ni_vp; 456 457 if (VN_IS_DOOMED(vp)) { 458 /* 459 * This vnode is being recycled. Return != 0 to let the caller 460 * know that the sysctl had no effect. 
Return EAGAIN because a 461 * subsequent call will likely succeed (since namei will create 462 * a new vnode if necessary) 463 */ 464 error = EAGAIN; 465 goto putvnode; 466 } 467 468 vgone(vp); 469 putvnode: 470 vput(vp); 471 NDFREE_PNBUF(&nd); 472 out: 473 free(buf, M_TEMP); 474 return (error); 475 } 476 477 static int 478 sysctl_ftry_reclaim_vnode(SYSCTL_HANDLER_ARGS) 479 { 480 struct thread *td = curthread; 481 struct vnode *vp; 482 struct file *fp; 483 int error; 484 int fd; 485 486 if (req->newptr == NULL) 487 return (EBADF); 488 489 error = sysctl_handle_int(oidp, &fd, 0, req); 490 if (error != 0) 491 return (error); 492 error = getvnode(curthread, fd, &cap_fcntl_rights, &fp); 493 if (error != 0) 494 return (error); 495 vp = fp->f_vnode; 496 497 error = vn_lock(vp, LK_EXCLUSIVE); 498 if (error != 0) 499 goto drop; 500 501 vgone(vp); 502 VOP_UNLOCK(vp); 503 drop: 504 fdrop(fp, td); 505 return (error); 506 } 507 508 SYSCTL_PROC(_debug, OID_AUTO, try_reclaim_vnode, 509 CTLTYPE_STRING | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0, 510 sysctl_try_reclaim_vnode, "A", "Try to reclaim a vnode by its pathname"); 511 SYSCTL_PROC(_debug, OID_AUTO, ftry_reclaim_vnode, 512 CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0, 513 sysctl_ftry_reclaim_vnode, "I", 514 "Try to reclaim a vnode by its file descriptor"); 515 516 /* Shift count for (uintptr_t)vp to initialize vp->v_hash. */ 517 #define vnsz2log 8 518 #ifndef DEBUG_LOCKS 519 _Static_assert(sizeof(struct vnode) >= 1UL << vnsz2log && 520 sizeof(struct vnode) < 1UL << (vnsz2log + 1), 521 "vnsz2log needs to be updated"); 522 #endif 523 524 /* 525 * Support for the bufobj clean & dirty pctrie. 526 */ 527 static void * 528 buf_trie_alloc(struct pctrie *ptree) 529 { 530 return (uma_zalloc_smr(buf_trie_zone, M_NOWAIT)); 531 } 532 533 static void 534 buf_trie_free(struct pctrie *ptree, void *node) 535 { 536 uma_zfree_smr(buf_trie_zone, node); 537 } 538 PCTRIE_DEFINE_SMR(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free, 539 buf_trie_smr); 540 541 /* 542 * Lookup the next element greater than or equal to lblkno, accounting for the 543 * fact that, for pctries, negative values are greater than nonnegative ones. 544 */ 545 static struct buf * 546 buf_lookup_ge(struct bufv *bv, daddr_t lblkno) 547 { 548 struct buf *bp; 549 550 bp = BUF_PCTRIE_LOOKUP_GE(&bv->bv_root, lblkno); 551 if (bp == NULL && lblkno < 0) 552 bp = BUF_PCTRIE_LOOKUP_GE(&bv->bv_root, 0); 553 if (bp != NULL && bp->b_lblkno < lblkno) 554 bp = NULL; 555 return (bp); 556 } 557 558 /* 559 * Insert bp, and find the next element smaller than bp, accounting for the fact 560 * that, for pctries, negative values are greater than nonnegative ones. 561 */ 562 static int 563 buf_insert_lookup_le(struct bufv *bv, struct buf *bp, struct buf **n) 564 { 565 int error; 566 567 error = BUF_PCTRIE_INSERT_LOOKUP_LE(&bv->bv_root, bp, n); 568 if (error != EEXIST) { 569 if (*n == NULL && bp->b_lblkno >= 0) 570 *n = BUF_PCTRIE_LOOKUP_LE(&bv->bv_root, ~0L); 571 if (*n != NULL && (*n)->b_lblkno >= bp->b_lblkno) 572 *n = NULL; 573 } 574 return (error); 575 } 576 577 /* 578 * Initialize the vnode management data structures. 579 * 580 * Reevaluate the following cap on the number of vnodes after the physical 581 * memory size exceeds 512GB. In the limit, as the physical memory size 582 * grows, the ratio of the memory size in KB to vnodes approaches 64:1. 
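 *
 * For example, at the 512GB point the cap below evaluates to
 * 512 * 1024 * 1024 / 64 = 8388608 vnodes, while 512GB expressed in KB is
 * 512 * 1024 * 1024 = 536870912, giving the 64:1 KB-to-vnode ratio noted
 * above.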
583 */ 584 #ifndef MAXVNODES_MAX 585 #define MAXVNODES_MAX (512UL * 1024 * 1024 / 64) /* 8M */ 586 #endif 587 588 static MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker"); 589 590 static struct vnode * 591 vn_alloc_marker(struct mount *mp) 592 { 593 struct vnode *vp; 594 595 vp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO); 596 vp->v_type = VMARKER; 597 vp->v_mount = mp; 598 599 return (vp); 600 } 601 602 static void 603 vn_free_marker(struct vnode *vp) 604 { 605 606 MPASS(vp->v_type == VMARKER); 607 free(vp, M_VNODE_MARKER); 608 } 609 610 #ifdef KASAN 611 static int 612 vnode_ctor(void *mem, int size, void *arg __unused, int flags __unused) 613 { 614 kasan_mark(mem, size, roundup2(size, UMA_ALIGN_PTR + 1), 0); 615 return (0); 616 } 617 618 static void 619 vnode_dtor(void *mem, int size, void *arg __unused) 620 { 621 size_t end1, end2, off1, off2; 622 623 _Static_assert(offsetof(struct vnode, v_vnodelist) < 624 offsetof(struct vnode, v_dbatchcpu), 625 "KASAN marks require updating"); 626 627 off1 = offsetof(struct vnode, v_vnodelist); 628 off2 = offsetof(struct vnode, v_dbatchcpu); 629 end1 = off1 + sizeof(((struct vnode *)NULL)->v_vnodelist); 630 end2 = off2 + sizeof(((struct vnode *)NULL)->v_dbatchcpu); 631 632 /* 633 * Access to the v_vnodelist and v_dbatchcpu fields are permitted even 634 * after the vnode has been freed. Try to get some KASAN coverage by 635 * marking everything except those two fields as invalid. Because 636 * KASAN's tracking is not byte-granular, any preceding fields sharing 637 * the same 8-byte aligned word must also be marked valid. 638 */ 639 640 /* Handle the area from the start until v_vnodelist... */ 641 off1 = rounddown2(off1, KASAN_SHADOW_SCALE); 642 kasan_mark(mem, off1, off1, KASAN_UMA_FREED); 643 644 /* ... then the area between v_vnodelist and v_dbatchcpu ... */ 645 off1 = roundup2(end1, KASAN_SHADOW_SCALE); 646 off2 = rounddown2(off2, KASAN_SHADOW_SCALE); 647 if (off2 > off1) 648 kasan_mark((void *)((char *)mem + off1), off2 - off1, 649 off2 - off1, KASAN_UMA_FREED); 650 651 /* ... and finally the area from v_dbatchcpu to the end. */ 652 off2 = roundup2(end2, KASAN_SHADOW_SCALE); 653 kasan_mark((void *)((char *)mem + off2), size - off2, size - off2, 654 KASAN_UMA_FREED); 655 } 656 #endif /* KASAN */ 657 658 /* 659 * Initialize a vnode as it first enters the zone. 660 */ 661 static int 662 vnode_init(void *mem, int size, int flags) 663 { 664 struct vnode *vp; 665 666 vp = mem; 667 bzero(vp, size); 668 /* 669 * Setup locks. 670 */ 671 vp->v_vnlock = &vp->v_lock; 672 mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF); 673 /* 674 * By default, don't allow shared locks unless filesystems opt-in. 675 */ 676 lockinit(vp->v_vnlock, PVFS, "vnode", VLKTIMEOUT, 677 LK_NOSHARE | LK_IS_VNODE); 678 /* 679 * Initialize bufobj. 680 */ 681 bufobj_init(&vp->v_bufobj, vp); 682 /* 683 * Initialize namecache. 684 */ 685 cache_vnode_init(vp); 686 /* 687 * Initialize rangelocks. 688 */ 689 rangelock_init(&vp->v_rl); 690 691 vp->v_dbatchcpu = NOCPU; 692 693 vp->v_state = VSTATE_DEAD; 694 695 /* 696 * Check vhold_recycle_free for an explanation. 697 */ 698 vp->v_holdcnt = VHOLD_NO_SMR; 699 vp->v_type = VNON; 700 mtx_lock(&vnode_list_mtx); 701 TAILQ_INSERT_BEFORE(vnode_list_free_marker, vp, v_vnodelist); 702 mtx_unlock(&vnode_list_mtx); 703 return (0); 704 } 705 706 /* 707 * Free a vnode when it is cleared from the zone. 
708 */ 709 static void 710 vnode_fini(void *mem, int size) 711 { 712 struct vnode *vp; 713 struct bufobj *bo; 714 715 vp = mem; 716 vdbatch_dequeue(vp); 717 mtx_lock(&vnode_list_mtx); 718 TAILQ_REMOVE(&vnode_list, vp, v_vnodelist); 719 mtx_unlock(&vnode_list_mtx); 720 rangelock_destroy(&vp->v_rl); 721 lockdestroy(vp->v_vnlock); 722 mtx_destroy(&vp->v_interlock); 723 bo = &vp->v_bufobj; 724 rw_destroy(BO_LOCKPTR(bo)); 725 726 kasan_mark(mem, size, size, 0); 727 } 728 729 /* 730 * Provide the size of NFS nclnode and NFS fh for calculation of the 731 * vnode memory consumption. The size is specified directly to 732 * eliminate dependency on NFS-private header. 733 * 734 * Other filesystems may use bigger or smaller (like UFS and ZFS) 735 * private inode data, but the NFS-based estimation is ample enough. 736 * Still, we care about differences in the size between 64- and 32-bit 737 * platforms. 738 * 739 * Namecache structure size is heuristically 740 * sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1. 741 */ 742 #ifdef _LP64 743 #define NFS_NCLNODE_SZ (528 + 64) 744 #define NC_SZ 148 745 #else 746 #define NFS_NCLNODE_SZ (360 + 32) 747 #define NC_SZ 92 748 #endif 749 750 static void 751 vntblinit(void *dummy __unused) 752 { 753 struct vdbatch *vd; 754 uma_ctor ctor; 755 uma_dtor dtor; 756 int cpu, physvnodes, virtvnodes; 757 758 /* 759 * 'desiredvnodes' is the minimum of a function of the physical memory 760 * size and another of the kernel heap size (UMA limit, a portion of the 761 * KVA). 762 * 763 * Currently, on 64-bit platforms, 'desiredvnodes' is set to 764 * 'virtvnodes' up to a physical memory cutoff of ~1722MB, after which 765 * 'physvnodes' applies instead. With the current automatic tuning for 766 * 'maxfiles' (32 files/MB), 'desiredvnodes' is always greater than it. 767 */ 768 physvnodes = maxproc + pgtok(vm_cnt.v_page_count) / 32 + 769 min(98304 * 16, pgtok(vm_cnt.v_page_count)) / 32; 770 virtvnodes = vm_kmem_size / (10 * (sizeof(struct vm_object) + 771 sizeof(struct vnode) + NC_SZ * ncsizefactor + NFS_NCLNODE_SZ)); 772 desiredvnodes = min(physvnodes, virtvnodes); 773 if (desiredvnodes > MAXVNODES_MAX) { 774 if (bootverbose) 775 printf("Reducing kern.maxvnodes %lu -> %lu\n", 776 desiredvnodes, MAXVNODES_MAX); 777 desiredvnodes = MAXVNODES_MAX; 778 } 779 wantfreevnodes = desiredvnodes / 4; 780 mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF); 781 TAILQ_INIT(&vnode_list); 782 mtx_init(&vnode_list_mtx, "vnode_list", NULL, MTX_DEF); 783 /* 784 * The lock is taken to appease WITNESS. 785 */ 786 mtx_lock(&vnode_list_mtx); 787 vnlru_recalc(); 788 mtx_unlock(&vnode_list_mtx); 789 vnode_list_free_marker = vn_alloc_marker(NULL); 790 TAILQ_INSERT_HEAD(&vnode_list, vnode_list_free_marker, v_vnodelist); 791 vnode_list_reclaim_marker = vn_alloc_marker(NULL); 792 TAILQ_INSERT_HEAD(&vnode_list, vnode_list_reclaim_marker, v_vnodelist); 793 794 #ifdef KASAN 795 ctor = vnode_ctor; 796 dtor = vnode_dtor; 797 #else 798 ctor = NULL; 799 dtor = NULL; 800 #endif 801 vnode_zone = uma_zcreate("VNODE", sizeof(struct vnode), ctor, dtor, 802 vnode_init, vnode_fini, UMA_ALIGN_PTR, UMA_ZONE_NOKASAN); 803 uma_zone_set_smr(vnode_zone, vfs_smr); 804 805 /* 806 * Preallocate enough nodes to support one-per buf so that 807 * we can not fail an insert. reassignbuf() callers can not 808 * tolerate the insertion failure. 
 */
	buf_trie_zone = uma_zcreate("BUF TRIE", pctrie_node_size(),
	    NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR,
	    UMA_ZONE_NOFREE | UMA_ZONE_SMR);
	buf_trie_smr = uma_zone_get_smr(buf_trie_zone);
	uma_prealloc(buf_trie_zone, nbuf);

	vnodes_created = counter_u64_alloc(M_WAITOK);
	direct_recycles_free_count = counter_u64_alloc(M_WAITOK);
	vnode_skipped_requeues = counter_u64_alloc(M_WAITOK);

	/*
	 * Initialize the filesystem syncer.
	 */
	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
	    &syncer_mask);
	syncer_maxdelay = syncer_mask + 1;
	mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
	cv_init(&sync_wakeup, "syncer");

	CPU_FOREACH(cpu) {
		vd = DPCPU_ID_PTR((cpu), vd);
		bzero(vd, sizeof(*vd));
		mtx_init(&vd->lock, "vdbatch", NULL, MTX_DEF);
	}
}
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL);

/*
 * Mark a mount point as busy. Used to synchronize access and to delay
 * unmounting. Eventually, mountlist_mtx is not released on failure.
 *
 * vfs_busy() is a custom lock, it can block the caller.
 * vfs_busy() only sleeps if the unmount is active on the mount point.
 * For a mountpoint mp, vfs_busy-enforced lock is before lock of any
 * vnode belonging to mp.
 *
 * Lookup uses vfs_busy() to traverse mount points.
 * root fs				var fs
 * / vnode lock		A	/ vnode lock (/var)		D
 * /var vnode lock	B	/log vnode lock(/var/log)	E
 * vfs_busy lock	C	vfs_busy lock			F
 *
 * Within each file system, the lock order is C->A->B and F->D->E.
 *
 * When traversing across mounts, the system follows that lock order:
 *
 *	C->A->B
 *	      |
 *	      +->F->D->E
 *
 * The lookup() process for namei("/var") illustrates the process:
 *  1. VOP_LOOKUP() obtains B while A is held
 *  2. vfs_busy() obtains a shared lock on F while A and B are held
 *  3. vput() releases lock on B
 *  4. vput() releases lock on A
 *  5. VFS_ROOT() obtains lock on D while shared lock on F is held
 *  6. vfs_unbusy() releases shared lock on F
 *  7. vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A.
 *     Attempt to lock A (instead of vp_crossmp) while D is held would
 *     violate the global order, causing deadlocks.
 *
 * dounmount() locks B while F is drained. Note that for stacked
 * filesystems, D and B in the example above may be the same lock,
 * which introduces potential lock order reversal deadlock between
 * dounmount() and step 5 above. These filesystems may avoid the LOR
 * by setting VV_CROSSLOCK on the covered vnode so that lock B will
 * remain held until after step 5.
 */
int
vfs_busy(struct mount *mp, int flags)
{
	struct mount_pcpu *mpcpu;

	MPASS((flags & ~MBF_MASK) == 0);
	CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags);

	if (vfs_op_thread_enter(mp, mpcpu)) {
		MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0);
		MPASS((mp->mnt_kern_flag & MNTK_UNMOUNT) == 0);
		MPASS((mp->mnt_kern_flag & MNTK_REFEXPIRE) == 0);
		vfs_mp_count_add_pcpu(mpcpu, ref, 1);
		vfs_mp_count_add_pcpu(mpcpu, lockref, 1);
		vfs_op_thread_exit(mp, mpcpu);
		if (flags & MBF_MNTLSTLOCK)
			mtx_unlock(&mountlist_mtx);
		return (0);
	}

	MNT_ILOCK(mp);
	vfs_assert_mount_counters(mp);
	MNT_REF(mp);
	/*
	 * If mount point is currently being unmounted, sleep until the
	 * mount point fate is decided.
	 * If the thread doing the unmounting fails,
	 * it will clear MNTK_UNMOUNT flag before waking us up, indicating
	 * that this mount point has survived the unmount attempt and vfs_busy
	 * should retry. Otherwise the unmounter thread will set MNTK_REFEXPIRE
	 * flag in addition to MNTK_UNMOUNT, indicating that mount point is
	 * about to be really destroyed. vfs_busy needs to release its
	 * reference on the mount point in this case and return with ENOENT,
	 * telling the caller the mount it tried to busy is no longer valid.
	 */
	while (mp->mnt_kern_flag & MNTK_UNMOUNT) {
		KASSERT(TAILQ_EMPTY(&mp->mnt_uppers),
		    ("%s: non-empty upper mount list with pending unmount",
		    __func__));
		if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) {
			MNT_REL(mp);
			MNT_IUNLOCK(mp);
			CTR1(KTR_VFS, "%s: failed busying before sleeping",
			    __func__);
			return (ENOENT);
		}
		if (flags & MBF_MNTLSTLOCK)
			mtx_unlock(&mountlist_mtx);
		mp->mnt_kern_flag |= MNTK_MWAIT;
		msleep(mp, MNT_MTX(mp), PVFS | PDROP, "vfs_busy", 0);
		if (flags & MBF_MNTLSTLOCK)
			mtx_lock(&mountlist_mtx);
		MNT_ILOCK(mp);
	}
	if (flags & MBF_MNTLSTLOCK)
		mtx_unlock(&mountlist_mtx);
	mp->mnt_lockref++;
	MNT_IUNLOCK(mp);
	return (0);
}

/*
 * Free a busy filesystem.
 */
void
vfs_unbusy(struct mount *mp)
{
	struct mount_pcpu *mpcpu;
	int c;

	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);

	if (vfs_op_thread_enter(mp, mpcpu)) {
		MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0);
		vfs_mp_count_sub_pcpu(mpcpu, lockref, 1);
		vfs_mp_count_sub_pcpu(mpcpu, ref, 1);
		vfs_op_thread_exit(mp, mpcpu);
		return;
	}

	MNT_ILOCK(mp);
	vfs_assert_mount_counters(mp);
	MNT_REL(mp);
	c = --mp->mnt_lockref;
	if (mp->mnt_vfs_ops == 0) {
		MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0);
		MNT_IUNLOCK(mp);
		return;
	}
	if (c < 0)
		vfs_dump_mount_counters(mp);
	if (c == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) {
		MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT);
		CTR1(KTR_VFS, "%s: waking up waiters", __func__);
		mp->mnt_kern_flag &= ~MNTK_DRAINING;
		wakeup(&mp->mnt_lockref);
	}
	MNT_IUNLOCK(mp);
}

/*
 * Lookup a mount point by filesystem identifier.
 */
struct mount *
vfs_getvfs(fsid_t *fsid)
{
	struct mount *mp;

	CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
	mtx_lock(&mountlist_mtx);
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) {
			vfs_ref(mp);
			mtx_unlock(&mountlist_mtx);
			return (mp);
		}
	}
	mtx_unlock(&mountlist_mtx);
	CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
	return ((struct mount *) 0);
}

/*
 * Lookup a mount point by filesystem identifier, busying it before
 * returning.
 *
 * To avoid congestion on mountlist_mtx, implement a simple direct-mapped
 * cache for popular filesystem identifiers. The cache is lockless, using
 * the fact that struct mount's are never freed. In the worst case we may
 * get a pointer to an unmounted or even a different filesystem, so we have
 * to check what we got, and go the slow way if so.
1008 */ 1009 struct mount * 1010 vfs_busyfs(fsid_t *fsid) 1011 { 1012 #define FSID_CACHE_SIZE 256 1013 typedef struct mount * volatile vmp_t; 1014 static vmp_t cache[FSID_CACHE_SIZE]; 1015 struct mount *mp; 1016 int error; 1017 uint32_t hash; 1018 1019 CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid); 1020 hash = fsid->val[0] ^ fsid->val[1]; 1021 hash = (hash >> 16 ^ hash) & (FSID_CACHE_SIZE - 1); 1022 mp = cache[hash]; 1023 if (mp == NULL || fsidcmp(&mp->mnt_stat.f_fsid, fsid) != 0) 1024 goto slow; 1025 if (vfs_busy(mp, 0) != 0) { 1026 cache[hash] = NULL; 1027 goto slow; 1028 } 1029 if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) 1030 return (mp); 1031 else 1032 vfs_unbusy(mp); 1033 1034 slow: 1035 mtx_lock(&mountlist_mtx); 1036 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 1037 if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) { 1038 error = vfs_busy(mp, MBF_MNTLSTLOCK); 1039 if (error) { 1040 cache[hash] = NULL; 1041 mtx_unlock(&mountlist_mtx); 1042 return (NULL); 1043 } 1044 cache[hash] = mp; 1045 return (mp); 1046 } 1047 } 1048 CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid); 1049 mtx_unlock(&mountlist_mtx); 1050 return ((struct mount *) 0); 1051 } 1052 1053 /* 1054 * Check if a user can access privileged mount options. 1055 */ 1056 int 1057 vfs_suser(struct mount *mp, struct thread *td) 1058 { 1059 int error; 1060 1061 if (jailed(td->td_ucred)) { 1062 /* 1063 * If the jail of the calling thread lacks permission for 1064 * this type of file system, deny immediately. 1065 */ 1066 if (!prison_allow(td->td_ucred, mp->mnt_vfc->vfc_prison_flag)) 1067 return (EPERM); 1068 1069 /* 1070 * If the file system was mounted outside the jail of the 1071 * calling thread, deny immediately. 1072 */ 1073 if (prison_check(td->td_ucred, mp->mnt_cred) != 0) 1074 return (EPERM); 1075 } 1076 1077 /* 1078 * If file system supports delegated administration, we don't check 1079 * for the PRIV_VFS_MOUNT_OWNER privilege - it will be better verified 1080 * by the file system itself. 1081 * If this is not the user that did original mount, we check for 1082 * the PRIV_VFS_MOUNT_OWNER privilege. 1083 */ 1084 if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) && 1085 mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) { 1086 if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0) 1087 return (error); 1088 } 1089 return (0); 1090 } 1091 1092 /* 1093 * Get a new unique fsid. Try to make its val[0] unique, since this value 1094 * will be used to create fake device numbers for stat(). Also try (but 1095 * not so hard) make its val[0] unique mod 2^16, since some emulators only 1096 * support 16-bit device numbers. We end up with unique val[0]'s for the 1097 * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls. 1098 * 1099 * Keep in mind that several mounts may be running in parallel. Starting 1100 * the search one past where the previous search terminated is both a 1101 * micro-optimization and a defense against returning the same fsid to 1102 * different mounts. 
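 *
 * To illustrate the layout produced below (with arbitrary example values):
 * for vfc_typenum 0x35 and mntid_base 0x1234, val[1] becomes 0x35 and
 * val[0] becomes makedev(255, 0x35120034), i.e. the low byte of the type
 * in bits 24-31 and the two bytes of mntid_base spread over bits 16-23
 * and 0-7 of the minor number.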
1103 */ 1104 void 1105 vfs_getnewfsid(struct mount *mp) 1106 { 1107 static uint16_t mntid_base; 1108 struct mount *nmp; 1109 fsid_t tfsid; 1110 int mtype; 1111 1112 CTR2(KTR_VFS, "%s: mp %p", __func__, mp); 1113 mtx_lock(&mntid_mtx); 1114 mtype = mp->mnt_vfc->vfc_typenum; 1115 tfsid.val[1] = mtype; 1116 mtype = (mtype & 0xFF) << 24; 1117 for (;;) { 1118 tfsid.val[0] = makedev(255, 1119 mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF)); 1120 mntid_base++; 1121 if ((nmp = vfs_getvfs(&tfsid)) == NULL) 1122 break; 1123 vfs_rel(nmp); 1124 } 1125 mp->mnt_stat.f_fsid.val[0] = tfsid.val[0]; 1126 mp->mnt_stat.f_fsid.val[1] = tfsid.val[1]; 1127 mtx_unlock(&mntid_mtx); 1128 } 1129 1130 /* 1131 * Knob to control the precision of file timestamps: 1132 * 1133 * 0 = seconds only; nanoseconds zeroed. 1134 * 1 = seconds and nanoseconds, accurate within 1/HZ. 1135 * 2 = seconds and nanoseconds, truncated to microseconds. 1136 * >=3 = seconds and nanoseconds, maximum precision. 1137 */ 1138 enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC }; 1139 1140 static int timestamp_precision = TSP_USEC; 1141 SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW, 1142 ×tamp_precision, 0, "File timestamp precision (0: seconds, " 1143 "1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to us, " 1144 "3+: sec + ns (max. precision))"); 1145 1146 /* 1147 * Get a current timestamp. 1148 */ 1149 void 1150 vfs_timestamp(struct timespec *tsp) 1151 { 1152 struct timeval tv; 1153 1154 switch (timestamp_precision) { 1155 case TSP_SEC: 1156 tsp->tv_sec = time_second; 1157 tsp->tv_nsec = 0; 1158 break; 1159 case TSP_HZ: 1160 getnanotime(tsp); 1161 break; 1162 case TSP_USEC: 1163 microtime(&tv); 1164 TIMEVAL_TO_TIMESPEC(&tv, tsp); 1165 break; 1166 case TSP_NSEC: 1167 default: 1168 nanotime(tsp); 1169 break; 1170 } 1171 } 1172 1173 /* 1174 * Set vnode attributes to VNOVAL 1175 */ 1176 void 1177 vattr_null(struct vattr *vap) 1178 { 1179 1180 vap->va_type = VNON; 1181 vap->va_size = VNOVAL; 1182 vap->va_bytes = VNOVAL; 1183 vap->va_mode = VNOVAL; 1184 vap->va_nlink = VNOVAL; 1185 vap->va_uid = VNOVAL; 1186 vap->va_gid = VNOVAL; 1187 vap->va_fsid = VNOVAL; 1188 vap->va_fileid = VNOVAL; 1189 vap->va_blocksize = VNOVAL; 1190 vap->va_rdev = VNOVAL; 1191 vap->va_atime.tv_sec = VNOVAL; 1192 vap->va_atime.tv_nsec = VNOVAL; 1193 vap->va_mtime.tv_sec = VNOVAL; 1194 vap->va_mtime.tv_nsec = VNOVAL; 1195 vap->va_ctime.tv_sec = VNOVAL; 1196 vap->va_ctime.tv_nsec = VNOVAL; 1197 vap->va_birthtime.tv_sec = VNOVAL; 1198 vap->va_birthtime.tv_nsec = VNOVAL; 1199 vap->va_flags = VNOVAL; 1200 vap->va_gen = VNOVAL; 1201 vap->va_vaflags = 0; 1202 vap->va_filerev = VNOVAL; 1203 vap->va_bsdflags = 0; 1204 } 1205 1206 /* 1207 * Try to reduce the total number of vnodes. 1208 * 1209 * This routine (and its user) are buggy in at least the following ways: 1210 * - all parameters were picked years ago when RAM sizes were significantly 1211 * smaller 1212 * - it can pick vnodes based on pages used by the vm object, but filesystems 1213 * like ZFS don't use it making the pick broken 1214 * - since ZFS has its own aging policy it gets partially combated by this one 1215 * - a dedicated method should be provided for filesystems to let them decide 1216 * whether the vnode should be recycled 1217 * 1218 * This routine is called when we have too many vnodes. It attempts 1219 * to free <count> vnodes and will potentially free vnodes that still 1220 * have VM backing store (VM backing store is typically the cause 1221 * of a vnode blowout so we want to do this). 
 * Therefore, this operation
 * is not considered cheap.
 *
 * A number of conditions may prevent a vnode from being reclaimed.
 * The buffer cache may have references on the vnode, a directory
 * vnode may still have references due to the namei cache representing
 * underlying files, or the vnode may be in active use. It is not
 * desirable to reuse such vnodes. These conditions may cause the
 * number of vnodes to reach some minimum value regardless of what
 * you set kern.maxvnodes to. Do not set kern.maxvnodes too low.
 *
 * @param reclaim_nc_src Only reclaim directories with outgoing namecache
 *			 entries if this argument is true
 * @param trigger	 Only reclaim vnodes with fewer than this many resident
 *			 pages.
 * @param target	 How many vnodes to reclaim.
 * @return		 The number of vnodes that were reclaimed.
 */
static int
vlrureclaim(bool reclaim_nc_src, int trigger, u_long target)
{
	struct vnode *vp, *mvp;
	struct mount *mp;
	struct vm_object *object;
	u_long done;
	bool retried;

	mtx_assert(&vnode_list_mtx, MA_OWNED);

	retried = false;
	done = 0;

	mvp = vnode_list_reclaim_marker;
restart:
	vp = mvp;
	while (done < target) {
		vp = TAILQ_NEXT(vp, v_vnodelist);
		if (__predict_false(vp == NULL))
			break;

		if (__predict_false(vp->v_type == VMARKER))
			continue;

		/*
		 * If it's been deconstructed already, it's still
		 * referenced, or it exceeds the trigger, skip it.
		 * Also skip free vnodes. We are trying to make space
		 * for more free vnodes, not reduce their count.
		 */
		if (vp->v_usecount > 0 || vp->v_holdcnt == 0 ||
		    (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)))
			goto next_iter;

		if (vp->v_type == VBAD || vp->v_type == VNON)
			goto next_iter;

		object = atomic_load_ptr(&vp->v_object);
		if (object == NULL || object->resident_page_count > trigger) {
			goto next_iter;
		}

		/*
		 * Handle races against vnode allocation. Filesystems lock the
		 * vnode some time after it gets returned from getnewvnode,
		 * despite type and hold count being manipulated earlier.
		 * Resorting to checking v_mount restores guarantees present
		 * before the global list was reworked to contain all vnodes.
1288 */ 1289 if (!VI_TRYLOCK(vp)) 1290 goto next_iter; 1291 if (__predict_false(vp->v_type == VBAD || vp->v_type == VNON)) { 1292 VI_UNLOCK(vp); 1293 goto next_iter; 1294 } 1295 if (vp->v_mount == NULL) { 1296 VI_UNLOCK(vp); 1297 goto next_iter; 1298 } 1299 vholdl(vp); 1300 VI_UNLOCK(vp); 1301 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); 1302 TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist); 1303 mtx_unlock(&vnode_list_mtx); 1304 1305 if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { 1306 vdrop_recycle(vp); 1307 goto next_iter_unlocked; 1308 } 1309 if (VOP_LOCK(vp, LK_EXCLUSIVE|LK_NOWAIT) != 0) { 1310 vdrop_recycle(vp); 1311 vn_finished_write(mp); 1312 goto next_iter_unlocked; 1313 } 1314 1315 VI_LOCK(vp); 1316 if (vp->v_usecount > 0 || 1317 (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) || 1318 (vp->v_object != NULL && vp->v_object->handle == vp && 1319 vp->v_object->resident_page_count > trigger)) { 1320 VOP_UNLOCK(vp); 1321 vdropl_recycle(vp); 1322 vn_finished_write(mp); 1323 goto next_iter_unlocked; 1324 } 1325 recycles_count++; 1326 vgonel(vp); 1327 VOP_UNLOCK(vp); 1328 vdropl_recycle(vp); 1329 vn_finished_write(mp); 1330 done++; 1331 next_iter_unlocked: 1332 maybe_yield(); 1333 mtx_lock(&vnode_list_mtx); 1334 goto restart; 1335 next_iter: 1336 MPASS(vp->v_type != VMARKER); 1337 if (!should_yield()) 1338 continue; 1339 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); 1340 TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist); 1341 mtx_unlock(&vnode_list_mtx); 1342 kern_yield(PRI_USER); 1343 mtx_lock(&vnode_list_mtx); 1344 goto restart; 1345 } 1346 if (done == 0 && !retried) { 1347 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); 1348 TAILQ_INSERT_HEAD(&vnode_list, mvp, v_vnodelist); 1349 retried = true; 1350 goto restart; 1351 } 1352 return (done); 1353 } 1354 1355 static int max_free_per_call = 10000; 1356 SYSCTL_INT(_debug, OID_AUTO, max_vnlru_free, CTLFLAG_RW, &max_free_per_call, 0, 1357 "limit on vnode free requests per call to the vnlru_free routine (legacy)"); 1358 SYSCTL_INT(_vfs_vnode_vnlru, OID_AUTO, max_free_per_call, CTLFLAG_RW, 1359 &max_free_per_call, 0, 1360 "limit on vnode free requests per call to the vnlru_free routine"); 1361 1362 /* 1363 * Attempt to recycle requested amount of free vnodes. 1364 */ 1365 static int 1366 vnlru_free_impl(int count, struct vfsops *mnt_op, struct vnode *mvp, bool isvnlru) 1367 { 1368 struct vnode *vp; 1369 struct mount *mp; 1370 int ocount; 1371 bool retried; 1372 1373 mtx_assert(&vnode_list_mtx, MA_OWNED); 1374 if (count > max_free_per_call) 1375 count = max_free_per_call; 1376 if (count == 0) { 1377 mtx_unlock(&vnode_list_mtx); 1378 return (0); 1379 } 1380 ocount = count; 1381 retried = false; 1382 vp = mvp; 1383 for (;;) { 1384 vp = TAILQ_NEXT(vp, v_vnodelist); 1385 if (__predict_false(vp == NULL)) { 1386 /* 1387 * The free vnode marker can be past eligible vnodes: 1388 * 1. if vdbatch_process trylock failed 1389 * 2. if vtryrecycle failed 1390 * 1391 * If so, start the scan from scratch. 
1392 */ 1393 if (!retried && vnlru_read_freevnodes() > 0) { 1394 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); 1395 TAILQ_INSERT_HEAD(&vnode_list, mvp, v_vnodelist); 1396 vp = mvp; 1397 retried = true; 1398 continue; 1399 } 1400 1401 /* 1402 * Give up 1403 */ 1404 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); 1405 TAILQ_INSERT_TAIL(&vnode_list, mvp, v_vnodelist); 1406 mtx_unlock(&vnode_list_mtx); 1407 break; 1408 } 1409 if (__predict_false(vp->v_type == VMARKER)) 1410 continue; 1411 if (vp->v_holdcnt > 0) 1412 continue; 1413 /* 1414 * Don't recycle if our vnode is from different type 1415 * of mount point. Note that mp is type-safe, the 1416 * check does not reach unmapped address even if 1417 * vnode is reclaimed. 1418 */ 1419 if (mnt_op != NULL && (mp = vp->v_mount) != NULL && 1420 mp->mnt_op != mnt_op) { 1421 continue; 1422 } 1423 if (__predict_false(vp->v_type == VBAD || vp->v_type == VNON)) { 1424 continue; 1425 } 1426 if (!vhold_recycle_free(vp)) 1427 continue; 1428 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); 1429 TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist); 1430 mtx_unlock(&vnode_list_mtx); 1431 /* 1432 * FIXME: ignores the return value, meaning it may be nothing 1433 * got recycled but it claims otherwise to the caller. 1434 * 1435 * Originally the value started being ignored in 2005 with 1436 * 114a1006a8204aa156e1f9ad6476cdff89cada7f . 1437 * 1438 * Respecting the value can run into significant stalls if most 1439 * vnodes belong to one file system and it has writes 1440 * suspended. In presence of many threads and millions of 1441 * vnodes they keep contending on the vnode_list_mtx lock only 1442 * to find vnodes they can't recycle. 1443 * 1444 * The solution would be to pre-check if the vnode is likely to 1445 * be recycle-able, but it needs to happen with the 1446 * vnode_list_mtx lock held. This runs into a problem where 1447 * VOP_GETWRITEMOUNT (currently needed to find out about if 1448 * writes are frozen) can take locks which LOR against it. 1449 * 1450 * Check nullfs for one example (null_getwritemount). 1451 */ 1452 vtryrecycle(vp, isvnlru); 1453 count--; 1454 if (count == 0) { 1455 break; 1456 } 1457 mtx_lock(&vnode_list_mtx); 1458 vp = mvp; 1459 } 1460 mtx_assert(&vnode_list_mtx, MA_NOTOWNED); 1461 return (ocount - count); 1462 } 1463 1464 /* 1465 * XXX: returns without vnode_list_mtx locked! 
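 *
 * The expected calling pattern, matching the assertions in the wrappers
 * below, is roughly (names here are illustrative):
 *
 *	mtx_lock(&vnode_list_mtx);
 *	recycled = vnlru_free_locked_direct(count);
 *	(vnode_list_mtx has already been dropped at this point)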
1466 */ 1467 static int 1468 vnlru_free_locked_direct(int count) 1469 { 1470 int ret; 1471 1472 mtx_assert(&vnode_list_mtx, MA_OWNED); 1473 ret = vnlru_free_impl(count, NULL, vnode_list_free_marker, false); 1474 mtx_assert(&vnode_list_mtx, MA_NOTOWNED); 1475 return (ret); 1476 } 1477 1478 static int 1479 vnlru_free_locked_vnlru(int count) 1480 { 1481 int ret; 1482 1483 mtx_assert(&vnode_list_mtx, MA_OWNED); 1484 ret = vnlru_free_impl(count, NULL, vnode_list_free_marker, true); 1485 mtx_assert(&vnode_list_mtx, MA_NOTOWNED); 1486 return (ret); 1487 } 1488 1489 static int 1490 vnlru_free_vnlru(int count) 1491 { 1492 1493 mtx_lock(&vnode_list_mtx); 1494 return (vnlru_free_locked_vnlru(count)); 1495 } 1496 1497 void 1498 vnlru_free_vfsops(int count, struct vfsops *mnt_op, struct vnode *mvp) 1499 { 1500 1501 MPASS(mnt_op != NULL); 1502 MPASS(mvp != NULL); 1503 VNPASS(mvp->v_type == VMARKER, mvp); 1504 mtx_lock(&vnode_list_mtx); 1505 vnlru_free_impl(count, mnt_op, mvp, true); 1506 mtx_assert(&vnode_list_mtx, MA_NOTOWNED); 1507 } 1508 1509 struct vnode * 1510 vnlru_alloc_marker(void) 1511 { 1512 struct vnode *mvp; 1513 1514 mvp = vn_alloc_marker(NULL); 1515 mtx_lock(&vnode_list_mtx); 1516 TAILQ_INSERT_BEFORE(vnode_list_free_marker, mvp, v_vnodelist); 1517 mtx_unlock(&vnode_list_mtx); 1518 return (mvp); 1519 } 1520 1521 void 1522 vnlru_free_marker(struct vnode *mvp) 1523 { 1524 mtx_lock(&vnode_list_mtx); 1525 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); 1526 mtx_unlock(&vnode_list_mtx); 1527 vn_free_marker(mvp); 1528 } 1529 1530 static void 1531 vnlru_recalc(void) 1532 { 1533 1534 mtx_assert(&vnode_list_mtx, MA_OWNED); 1535 gapvnodes = imax(desiredvnodes - wantfreevnodes, 100); 1536 vhiwat = gapvnodes / 11; /* 9% -- just under the 10% in vlrureclaim() */ 1537 vlowat = vhiwat / 2; 1538 } 1539 1540 /* 1541 * Attempt to recycle vnodes in a context that is always safe to block. 1542 * Calling vlrurecycle() from the bowels of filesystem code has some 1543 * interesting deadlock problems. 1544 */ 1545 static struct proc *vnlruproc; 1546 static int vnlruproc_sig; 1547 static u_long vnlruproc_kicks; 1548 1549 SYSCTL_ULONG(_vfs_vnode_vnlru, OID_AUTO, kicks, CTLFLAG_RD, &vnlruproc_kicks, 0, 1550 "Number of times vnlru awakened due to vnode shortage"); 1551 1552 #define VNLRU_COUNT_SLOP 100 1553 1554 /* 1555 * The main freevnodes counter is only updated when a counter local to CPU 1556 * diverges from 0 by more than VNLRU_FREEVNODES_SLOP. CPUs are conditionally 1557 * walked to compute a more accurate total. 1558 * 1559 * Note: the actual value at any given moment can still exceed slop, but it 1560 * should not be by significant margin in practice. 
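 *
 * A rough upper bound on the drift: each CPU-local counter is rolled up
 * once it reaches +/- VNLRU_FREEVNODES_SLOP, so the global counter should
 * lag by no more than about (mp_ncpus * VNLRU_FREEVNODES_SLOP) at any
 * given time.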
1561 */ 1562 #define VNLRU_FREEVNODES_SLOP 126 1563 1564 static void __noinline 1565 vfs_freevnodes_rollup(int8_t *lfreevnodes) 1566 { 1567 1568 atomic_add_long(&freevnodes, *lfreevnodes); 1569 *lfreevnodes = 0; 1570 critical_exit(); 1571 } 1572 1573 static __inline void 1574 vfs_freevnodes_inc(void) 1575 { 1576 int8_t *lfreevnodes; 1577 1578 critical_enter(); 1579 lfreevnodes = PCPU_PTR(vfs_freevnodes); 1580 (*lfreevnodes)++; 1581 if (__predict_false(*lfreevnodes == VNLRU_FREEVNODES_SLOP)) 1582 vfs_freevnodes_rollup(lfreevnodes); 1583 else 1584 critical_exit(); 1585 } 1586 1587 static __inline void 1588 vfs_freevnodes_dec(void) 1589 { 1590 int8_t *lfreevnodes; 1591 1592 critical_enter(); 1593 lfreevnodes = PCPU_PTR(vfs_freevnodes); 1594 (*lfreevnodes)--; 1595 if (__predict_false(*lfreevnodes == -VNLRU_FREEVNODES_SLOP)) 1596 vfs_freevnodes_rollup(lfreevnodes); 1597 else 1598 critical_exit(); 1599 } 1600 1601 static u_long 1602 vnlru_read_freevnodes(void) 1603 { 1604 long slop, rfreevnodes, rfreevnodes_old; 1605 int cpu; 1606 1607 rfreevnodes = atomic_load_long(&freevnodes); 1608 rfreevnodes_old = atomic_load_long(&freevnodes_old); 1609 1610 if (rfreevnodes > rfreevnodes_old) 1611 slop = rfreevnodes - rfreevnodes_old; 1612 else 1613 slop = rfreevnodes_old - rfreevnodes; 1614 if (slop < VNLRU_FREEVNODES_SLOP) 1615 return (rfreevnodes >= 0 ? rfreevnodes : 0); 1616 CPU_FOREACH(cpu) { 1617 rfreevnodes += cpuid_to_pcpu[cpu]->pc_vfs_freevnodes; 1618 } 1619 atomic_store_long(&freevnodes_old, rfreevnodes); 1620 return (freevnodes_old >= 0 ? freevnodes_old : 0); 1621 } 1622 1623 static bool 1624 vnlru_under(u_long rnumvnodes, u_long limit) 1625 { 1626 u_long rfreevnodes, space; 1627 1628 if (__predict_false(rnumvnodes > desiredvnodes)) 1629 return (true); 1630 1631 space = desiredvnodes - rnumvnodes; 1632 if (space < limit) { 1633 rfreevnodes = vnlru_read_freevnodes(); 1634 if (rfreevnodes > wantfreevnodes) 1635 space += rfreevnodes - wantfreevnodes; 1636 } 1637 return (space < limit); 1638 } 1639 1640 static void 1641 vnlru_kick_locked(void) 1642 { 1643 1644 mtx_assert(&vnode_list_mtx, MA_OWNED); 1645 if (vnlruproc_sig == 0) { 1646 vnlruproc_sig = 1; 1647 vnlruproc_kicks++; 1648 wakeup(vnlruproc); 1649 } 1650 } 1651 1652 static void 1653 vnlru_kick_cond(void) 1654 { 1655 1656 if (vnlru_read_freevnodes() > wantfreevnodes) 1657 return; 1658 1659 if (vnlruproc_sig) 1660 return; 1661 mtx_lock(&vnode_list_mtx); 1662 vnlru_kick_locked(); 1663 mtx_unlock(&vnode_list_mtx); 1664 } 1665 1666 static void 1667 vnlru_proc_sleep(void) 1668 { 1669 1670 if (vnlruproc_sig) { 1671 vnlruproc_sig = 0; 1672 wakeup(&vnlruproc_sig); 1673 } 1674 msleep(vnlruproc, &vnode_list_mtx, PVFS|PDROP, "vlruwt", hz); 1675 } 1676 1677 /* 1678 * A lighter version of the machinery below. 1679 * 1680 * Tries to reach goals only by recycling free vnodes and does not invoke 1681 * uma_reclaim(UMA_RECLAIM_DRAIN). 1682 * 1683 * This works around pathological behavior in vnlru in presence of tons of free 1684 * vnodes, but without having to rewrite the machinery at this time. Said 1685 * behavior boils down to continuously trying to reclaim all kinds of vnodes 1686 * (cycling through all levels of "force") when the count is transiently above 1687 * limit. This happens a lot when all vnodes are used up and vn_alloc 1688 * speculatively increments the counter. 
 *
 * Sample testcase: vnode limit 8388608, 20 separate directory trees each with
 * 1 million files in total and 20 find(1) processes stating them in parallel
 * (one per each tree).
 *
 * On a kernel with only stock machinery this needs anywhere between 60 and 120
 * seconds to execute (time varies *wildly* between runs). With the workaround
 * it consistently stays around 20 seconds [it got further down with later
 * changes].
 *
 * That is to say the entire thing needs a fundamental redesign (most notably
 * to accommodate faster recycling), the above only tries to get it out of the way.
 *
 * Return values are:
 * -1 -- fallback to regular vnlru loop
 *  0 -- do nothing, go to sleep
 * >0 -- recycle this many vnodes
 */
static long
vnlru_proc_light_pick(void)
{
	u_long rnumvnodes, rfreevnodes;

	if (vstir || vnlruproc_sig == 1)
		return (-1);

	rnumvnodes = atomic_load_long(&numvnodes);
	rfreevnodes = vnlru_read_freevnodes();

	/*
	 * vnode limit might have changed and now we may be at a significant
	 * excess. Bail if we can't sort it out with free vnodes.
	 *
	 * Due to atomic updates the count can legitimately go above
	 * the limit for a short period, don't bother doing anything in
	 * that case.
	 */
	if (rnumvnodes > desiredvnodes + VNLRU_COUNT_SLOP + 10) {
		if (rnumvnodes - rfreevnodes >= desiredvnodes ||
		    rfreevnodes <= wantfreevnodes) {
			return (-1);
		}

		return (rnumvnodes - desiredvnodes);
	}

	/*
	 * Don't try to reach wantfreevnodes target if there are too few vnodes
	 * to begin with.
	 */
	if (rnumvnodes < wantfreevnodes) {
		return (0);
	}

	if (rfreevnodes < wantfreevnodes) {
		return (-1);
	}

	return (0);
}

static bool
vnlru_proc_light(void)
{
	long freecount;

	mtx_assert(&vnode_list_mtx, MA_NOTOWNED);

	freecount = vnlru_proc_light_pick();
	if (freecount == -1)
		return (false);

	if (freecount != 0) {
		vnlru_free_vnlru(freecount);
	}

	mtx_lock(&vnode_list_mtx);
	vnlru_proc_sleep();
	mtx_assert(&vnode_list_mtx, MA_NOTOWNED);
	return (true);
}

static u_long uma_reclaim_calls;
SYSCTL_ULONG(_vfs_vnode_vnlru, OID_AUTO, uma_reclaim_calls, CTLFLAG_RD | CTLFLAG_STATS,
    &uma_reclaim_calls, 0, "Number of calls to uma_reclaim");

static void
vnlru_proc(void)
{
	u_long rnumvnodes, rfreevnodes, target;
	unsigned long onumvnodes;
	int done, force, trigger, usevnodes;
	bool reclaim_nc_src, want_reread;

	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, vnlruproc,
	    SHUTDOWN_PRI_FIRST);

	force = 0;
	want_reread = false;
	for (;;) {
		kproc_suspend_check(vnlruproc);

		if (force == 0 && vnlru_proc_light())
			continue;

		mtx_lock(&vnode_list_mtx);
		rnumvnodes = atomic_load_long(&numvnodes);

		if (want_reread) {
			force = vnlru_under(numvnodes, vhiwat) ? 1 : 0;
			want_reread = false;
		}

		/*
		 * If numvnodes is too large (due to desiredvnodes being
		 * adjusted using its sysctl, or emergency growth), first
		 * try to reduce it by discarding free vnodes.
1806 */ 1807 if (rnumvnodes > desiredvnodes + 10) { 1808 vnlru_free_locked_vnlru(rnumvnodes - desiredvnodes); 1809 mtx_lock(&vnode_list_mtx); 1810 rnumvnodes = atomic_load_long(&numvnodes); 1811 } 1812 /* 1813 * Sleep if the vnode cache is in a good state. This is 1814 * when it is not over-full and has space for about a 4% 1815 * or 9% expansion (by growing its size or inexcessively 1816 * reducing free vnode count). Otherwise, try to reclaim 1817 * space for a 10% expansion. 1818 */ 1819 if (vstir && force == 0) { 1820 force = 1; 1821 vstir = false; 1822 } 1823 if (force == 0 && !vnlru_under(rnumvnodes, vlowat)) { 1824 vnlru_proc_sleep(); 1825 continue; 1826 } 1827 rfreevnodes = vnlru_read_freevnodes(); 1828 1829 onumvnodes = rnumvnodes; 1830 /* 1831 * Calculate parameters for recycling. These are the same 1832 * throughout the loop to give some semblance of fairness. 1833 * The trigger point is to avoid recycling vnodes with lots 1834 * of resident pages. We aren't trying to free memory; we 1835 * are trying to recycle or at least free vnodes. 1836 */ 1837 if (rnumvnodes <= desiredvnodes) 1838 usevnodes = rnumvnodes - rfreevnodes; 1839 else 1840 usevnodes = rnumvnodes; 1841 if (usevnodes <= 0) 1842 usevnodes = 1; 1843 /* 1844 * The trigger value is chosen to give a conservatively 1845 * large value to ensure that it alone doesn't prevent 1846 * making progress. The value can easily be so large that 1847 * it is effectively infinite in some congested and 1848 * misconfigured cases, and this is necessary. Normally 1849 * it is about 8 to 100 (pages), which is quite large. 1850 */ 1851 trigger = vm_cnt.v_page_count * 2 / usevnodes; 1852 if (force < 2) 1853 trigger = vsmalltrigger; 1854 reclaim_nc_src = force >= 3; 1855 target = rnumvnodes * (int64_t)gapvnodes / imax(desiredvnodes, 1); 1856 target = target / 10 + 1; 1857 done = vlrureclaim(reclaim_nc_src, trigger, target); 1858 mtx_unlock(&vnode_list_mtx); 1859 /* 1860 * Total number of vnodes can transiently go slightly above the 1861 * limit (see vn_alloc_hard), no need to call uma_reclaim if 1862 * this happens. 1863 */ 1864 if (onumvnodes + VNLRU_COUNT_SLOP + 1000 > desiredvnodes && 1865 numvnodes <= desiredvnodes) { 1866 uma_reclaim_calls++; 1867 uma_reclaim(UMA_RECLAIM_DRAIN); 1868 } 1869 if (done == 0) { 1870 if (force == 0 || force == 1) { 1871 force = 2; 1872 continue; 1873 } 1874 if (force == 2) { 1875 force = 3; 1876 continue; 1877 } 1878 want_reread = true; 1879 force = 0; 1880 vnlru_nowhere++; 1881 tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3); 1882 } else { 1883 want_reread = true; 1884 kern_yield(PRI_USER); 1885 } 1886 } 1887 } 1888 1889 static struct kproc_desc vnlru_kp = { 1890 "vnlru", 1891 vnlru_proc, 1892 &vnlruproc 1893 }; 1894 SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, 1895 &vnlru_kp); 1896 1897 /* 1898 * Routines having to do with the management of the vnode table. 1899 */ 1900 1901 /* 1902 * Try to recycle a freed vnode. 1903 */ 1904 static int 1905 vtryrecycle(struct vnode *vp, bool isvnlru) 1906 { 1907 struct mount *vnmp; 1908 1909 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 1910 VNPASS(vp->v_holdcnt > 0, vp); 1911 /* 1912 * This vnode may found and locked via some other list, if so we 1913 * can't recycle it yet. 
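 *
 * The lock is taken with LK_NOWAIT: if it is already held the attempt
 * is abandoned and EWOULDBLOCK returned instead of waiting on a vnode
 * which is evidently in use.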
1914 */ 1915 if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) { 1916 CTR2(KTR_VFS, 1917 "%s: impossible to recycle, vp %p lock is already held", 1918 __func__, vp); 1919 vdrop_recycle(vp); 1920 return (EWOULDBLOCK); 1921 } 1922 /* 1923 * Don't recycle if its filesystem is being suspended. 1924 */ 1925 if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) { 1926 VOP_UNLOCK(vp); 1927 CTR2(KTR_VFS, 1928 "%s: impossible to recycle, cannot start the write for %p", 1929 __func__, vp); 1930 vdrop_recycle(vp); 1931 return (EBUSY); 1932 } 1933 /* 1934 * If we got this far, we need to acquire the interlock and see if 1935 * anyone picked up this vnode from another list. If not, we will 1936 * mark it with DOOMED via vgonel() so that anyone who does find it 1937 * will skip over it. 1938 */ 1939 VI_LOCK(vp); 1940 if (vp->v_usecount) { 1941 VOP_UNLOCK(vp); 1942 vdropl_recycle(vp); 1943 vn_finished_write(vnmp); 1944 CTR2(KTR_VFS, 1945 "%s: impossible to recycle, %p is already referenced", 1946 __func__, vp); 1947 return (EBUSY); 1948 } 1949 if (!VN_IS_DOOMED(vp)) { 1950 if (isvnlru) 1951 recycles_free_count++; 1952 else 1953 counter_u64_add(direct_recycles_free_count, 1); 1954 vgonel(vp); 1955 } 1956 VOP_UNLOCK(vp); 1957 vdropl_recycle(vp); 1958 vn_finished_write(vnmp); 1959 return (0); 1960 } 1961 1962 /* 1963 * Allocate a new vnode. 1964 * 1965 * The operation never returns an error. Returning an error was disabled 1966 * in r145385 (dated 2005) with the following comment: 1967 * 1968 * XXX Not all VFS_VGET/ffs_vget callers check returns. 1969 * 1970 * Given the age of this commit (almost 15 years at the time of writing this 1971 * comment) restoring the ability to fail requires a significant audit of 1972 * all codepaths. 1973 * 1974 * The routine can try to free a vnode or stall for up to 1 second waiting for 1975 * vnlru to clear things up, but ultimately always performs a M_WAITOK allocation. 1976 */ 1977 static u_long vn_alloc_cyclecount; 1978 static u_long vn_alloc_sleeps; 1979 1980 SYSCTL_ULONG(_vfs_vnode_stats, OID_AUTO, alloc_sleeps, CTLFLAG_RD, &vn_alloc_sleeps, 0, 1981 "Number of times vnode allocation blocked waiting on vnlru"); 1982 1983 static struct vnode * __noinline 1984 vn_alloc_hard(struct mount *mp, u_long rnumvnodes, bool bumped) 1985 { 1986 u_long rfreevnodes; 1987 1988 if (bumped) { 1989 if (rnumvnodes > desiredvnodes + VNLRU_COUNT_SLOP) { 1990 atomic_subtract_long(&numvnodes, 1); 1991 bumped = false; 1992 } 1993 } 1994 1995 mtx_lock(&vnode_list_mtx); 1996 1997 /* 1998 * Reload 'numvnodes', as since we acquired the lock, it may have 1999 * changed significantly if we waited, and 'rnumvnodes' above was only 2000 * actually passed if 'bumped' is true (else it is 0). 2001 */ 2002 rnumvnodes = atomic_load_long(&numvnodes); 2003 if (rnumvnodes + !bumped < desiredvnodes) { 2004 vn_alloc_cyclecount = 0; 2005 mtx_unlock(&vnode_list_mtx); 2006 goto alloc; 2007 } 2008 2009 rfreevnodes = vnlru_read_freevnodes(); 2010 if (vn_alloc_cyclecount++ >= rfreevnodes) { 2011 vn_alloc_cyclecount = 0; 2012 vstir = true; 2013 } 2014 2015 /* 2016 * Grow the vnode cache if it will not be above its target max after 2017 * growing. Otherwise, if there is at least one free vnode, try to 2018 * reclaim 1 item from it before growing the cache (possibly above its 2019 * target max if the reclamation failed or is delayed). 
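 *
 * If nothing could be freed and the filesystem is not being suspended,
 * the thread kicks vnlru and sleeps ("vlruwk") for up to a second
 * before proceeding with the allocation anyway.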
2020 */ 2021 if (vnlru_free_locked_direct(1) > 0) 2022 goto alloc; 2023 mtx_assert(&vnode_list_mtx, MA_NOTOWNED); 2024 if (mp == NULL || (mp->mnt_kern_flag & MNTK_SUSPEND) == 0) { 2025 /* 2026 * Wait for space for a new vnode. 2027 */ 2028 if (bumped) { 2029 atomic_subtract_long(&numvnodes, 1); 2030 bumped = false; 2031 } 2032 mtx_lock(&vnode_list_mtx); 2033 vnlru_kick_locked(); 2034 vn_alloc_sleeps++; 2035 msleep(&vnlruproc_sig, &vnode_list_mtx, PVFS, "vlruwk", hz); 2036 if (atomic_load_long(&numvnodes) + 1 > desiredvnodes && 2037 vnlru_read_freevnodes() > 1) 2038 vnlru_free_locked_direct(1); 2039 else 2040 mtx_unlock(&vnode_list_mtx); 2041 } 2042 alloc: 2043 mtx_assert(&vnode_list_mtx, MA_NOTOWNED); 2044 if (!bumped) 2045 atomic_add_long(&numvnodes, 1); 2046 vnlru_kick_cond(); 2047 return (uma_zalloc_smr(vnode_zone, M_WAITOK)); 2048 } 2049 2050 static struct vnode * 2051 vn_alloc(struct mount *mp) 2052 { 2053 u_long rnumvnodes; 2054 2055 if (__predict_false(vn_alloc_cyclecount != 0)) 2056 return (vn_alloc_hard(mp, 0, false)); 2057 rnumvnodes = atomic_fetchadd_long(&numvnodes, 1) + 1; 2058 if (__predict_false(vnlru_under(rnumvnodes, vlowat))) { 2059 return (vn_alloc_hard(mp, rnumvnodes, true)); 2060 } 2061 2062 return (uma_zalloc_smr(vnode_zone, M_WAITOK)); 2063 } 2064 2065 static void 2066 vn_free(struct vnode *vp) 2067 { 2068 2069 atomic_subtract_long(&numvnodes, 1); 2070 uma_zfree_smr(vnode_zone, vp); 2071 } 2072 2073 /* 2074 * Allocate a new vnode. 2075 */ 2076 int 2077 getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops, 2078 struct vnode **vpp) 2079 { 2080 struct vnode *vp; 2081 struct thread *td; 2082 struct lock_object *lo; 2083 2084 CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag); 2085 2086 KASSERT(vops->registered, 2087 ("%s: not registered vector op %p\n", __func__, vops)); 2088 cache_validate_vop_vector(mp, vops); 2089 2090 td = curthread; 2091 if (td->td_vp_reserved != NULL) { 2092 vp = td->td_vp_reserved; 2093 td->td_vp_reserved = NULL; 2094 } else { 2095 vp = vn_alloc(mp); 2096 } 2097 counter_u64_add(vnodes_created, 1); 2098 2099 vn_set_state(vp, VSTATE_UNINITIALIZED); 2100 2101 /* 2102 * Locks are given the generic name "vnode" when created. 2103 * Follow the historic practice of using the filesystem 2104 * name when they allocated, e.g., "zfs", "ufs", "nfs, etc. 2105 * 2106 * Locks live in a witness group keyed on their name. Thus, 2107 * when a lock is renamed, it must also move from the witness 2108 * group of its old name to the witness group of its new name. 2109 * 2110 * The change only needs to be made when the vnode moves 2111 * from one filesystem type to another. We ensure that each 2112 * filesystem use a single static name pointer for its tag so 2113 * that we can compare pointers rather than doing a strcmp(). 2114 */ 2115 lo = &vp->v_vnlock->lock_object; 2116 #ifdef WITNESS 2117 if (lo->lo_name != tag) { 2118 #endif 2119 lo->lo_name = tag; 2120 #ifdef WITNESS 2121 WITNESS_DESTROY(lo); 2122 WITNESS_INIT(lo, tag); 2123 } 2124 #endif 2125 /* 2126 * By default, don't allow shared locks unless filesystems opt-in. 2127 */ 2128 vp->v_vnlock->lock_object.lo_flags |= LK_NOSHARE; 2129 /* 2130 * Finalize various vnode identity bits. 
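 *
 * The asserts below verify that the previous user left no stale state
 * behind (v_object, v_lockf, v_pollinfo); the type, vop vector, flags,
 * reference counters, seqc and bufobj ops are then reset for the new
 * identity.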
2131 */ 2132 KASSERT(vp->v_object == NULL, ("stale v_object %p", vp)); 2133 KASSERT(vp->v_lockf == NULL, ("stale v_lockf %p", vp)); 2134 KASSERT(vp->v_pollinfo == NULL, ("stale v_pollinfo %p", vp)); 2135 vp->v_type = VNON; 2136 vp->v_op = vops; 2137 vp->v_irflag = 0; 2138 v_init_counters(vp); 2139 vn_seqc_init(vp); 2140 vp->v_bufobj.bo_ops = &buf_ops_bio; 2141 #ifdef DIAGNOSTIC 2142 if (mp == NULL && vops != &dead_vnodeops) 2143 printf("NULL mp in getnewvnode(9), tag %s\n", tag); 2144 #endif 2145 #ifdef MAC 2146 mac_vnode_init(vp); 2147 if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0) 2148 mac_vnode_associate_singlelabel(mp, vp); 2149 #endif 2150 if (mp != NULL) { 2151 vp->v_bufobj.bo_bsize = mp->mnt_stat.f_iosize; 2152 } 2153 2154 /* 2155 * For the filesystems which do not use vfs_hash_insert(), 2156 * still initialize v_hash to have vfs_hash_index() useful. 2157 * E.g., nullfs uses vfs_hash_index() on the lower vnode for 2158 * its own hashing. 2159 */ 2160 vp->v_hash = (uintptr_t)vp >> vnsz2log; 2161 2162 *vpp = vp; 2163 return (0); 2164 } 2165 2166 void 2167 getnewvnode_reserve(void) 2168 { 2169 struct thread *td; 2170 2171 td = curthread; 2172 MPASS(td->td_vp_reserved == NULL); 2173 td->td_vp_reserved = vn_alloc(NULL); 2174 } 2175 2176 void 2177 getnewvnode_drop_reserve(void) 2178 { 2179 struct thread *td; 2180 2181 td = curthread; 2182 if (td->td_vp_reserved != NULL) { 2183 vn_free(td->td_vp_reserved); 2184 td->td_vp_reserved = NULL; 2185 } 2186 } 2187 2188 static void __noinline 2189 freevnode(struct vnode *vp) 2190 { 2191 struct bufobj *bo; 2192 2193 /* 2194 * The vnode has been marked for destruction, so free it. 2195 * 2196 * The vnode will be returned to the zone where it will 2197 * normally remain until it is needed for another vnode. We 2198 * need to cleanup (or verify that the cleanup has already 2199 * been done) any residual data left from its current use 2200 * so as not to contaminate the freshly allocated vnode. 2201 */ 2202 CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp); 2203 /* 2204 * Paired with vgone. 2205 */ 2206 vn_seqc_write_end_free(vp); 2207 2208 bo = &vp->v_bufobj; 2209 VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't")); 2210 VNPASS(vp->v_holdcnt == VHOLD_NO_SMR, vp); 2211 VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count")); 2212 VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count")); 2213 VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's")); 2214 VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0")); 2215 VNASSERT(pctrie_is_empty(&bo->bo_clean.bv_root), vp, 2216 ("clean blk trie not empty")); 2217 VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0")); 2218 VNASSERT(pctrie_is_empty(&bo->bo_dirty.bv_root), vp, 2219 ("dirty blk trie not empty")); 2220 VNASSERT((vp->v_iflag & (VI_DOINGINACT | VI_OWEINACT)) == 0, vp, 2221 ("Leaked inactivation")); 2222 VI_UNLOCK(vp); 2223 cache_assert_no_entries(vp); 2224 2225 #ifdef MAC 2226 mac_vnode_destroy(vp); 2227 #endif 2228 if (vp->v_pollinfo != NULL) { 2229 /* 2230 * Use LK_NOWAIT to shut up witness about the lock. We may get 2231 * here while having another vnode locked when trying to 2232 * satisfy a lookup and needing to recycle. 
2233 */ 2234 VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT); 2235 destroy_vpollinfo(vp->v_pollinfo); 2236 VOP_UNLOCK(vp); 2237 vp->v_pollinfo = NULL; 2238 } 2239 vp->v_mountedhere = NULL; 2240 vp->v_unpcb = NULL; 2241 vp->v_rdev = NULL; 2242 vp->v_fifoinfo = NULL; 2243 vp->v_iflag = 0; 2244 vp->v_vflag = 0; 2245 bo->bo_flag = 0; 2246 vn_free(vp); 2247 } 2248 2249 /* 2250 * Delete from old mount point vnode list, if on one. 2251 */ 2252 static void 2253 delmntque(struct vnode *vp) 2254 { 2255 struct mount *mp; 2256 2257 VNPASS((vp->v_mflag & VMP_LAZYLIST) == 0, vp); 2258 2259 mp = vp->v_mount; 2260 MNT_ILOCK(mp); 2261 VI_LOCK(vp); 2262 vp->v_mount = NULL; 2263 VNASSERT(mp->mnt_nvnodelistsize > 0, vp, 2264 ("bad mount point vnode list size")); 2265 TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); 2266 mp->mnt_nvnodelistsize--; 2267 MNT_REL(mp); 2268 MNT_IUNLOCK(mp); 2269 /* 2270 * The caller expects the interlock to be still held. 2271 */ 2272 ASSERT_VI_LOCKED(vp, __func__); 2273 } 2274 2275 static int 2276 insmntque1_int(struct vnode *vp, struct mount *mp, bool dtr) 2277 { 2278 2279 KASSERT(vp->v_mount == NULL, 2280 ("insmntque: vnode already on per mount vnode list")); 2281 VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)")); 2282 if ((mp->mnt_kern_flag & MNTK_UNLOCKED_INSMNTQUE) == 0) { 2283 ASSERT_VOP_ELOCKED(vp, "insmntque: non-locked vp"); 2284 } else { 2285 KASSERT(!dtr, 2286 ("%s: can't have MNTK_UNLOCKED_INSMNTQUE and cleanup", 2287 __func__)); 2288 } 2289 2290 /* 2291 * We acquire the vnode interlock early to ensure that the 2292 * vnode cannot be recycled by another process releasing a 2293 * holdcnt on it before we get it on both the vnode list 2294 * and the active vnode list. The mount mutex protects only 2295 * manipulation of the vnode list and the vnode freelist 2296 * mutex protects only manipulation of the active vnode list. 2297 * Hence the need to hold the vnode interlock throughout. 2298 */ 2299 MNT_ILOCK(mp); 2300 VI_LOCK(vp); 2301 if (((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0 && 2302 ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 || 2303 mp->mnt_nvnodelistsize == 0)) && 2304 (vp->v_vflag & VV_FORCEINSMQ) == 0) { 2305 VI_UNLOCK(vp); 2306 MNT_IUNLOCK(mp); 2307 if (dtr) { 2308 vp->v_data = NULL; 2309 vp->v_op = &dead_vnodeops; 2310 vgone(vp); 2311 vput(vp); 2312 } 2313 return (EBUSY); 2314 } 2315 vp->v_mount = mp; 2316 MNT_REF(mp); 2317 TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); 2318 VNASSERT(mp->mnt_nvnodelistsize >= 0, vp, 2319 ("neg mount point vnode list size")); 2320 mp->mnt_nvnodelistsize++; 2321 VI_UNLOCK(vp); 2322 MNT_IUNLOCK(mp); 2323 return (0); 2324 } 2325 2326 /* 2327 * Insert into list of vnodes for the new mount point, if available. 2328 * insmntque() reclaims the vnode on insertion failure, insmntque1() 2329 * leaves handling of the vnode to the caller. 2330 */ 2331 int 2332 insmntque(struct vnode *vp, struct mount *mp) 2333 { 2334 return (insmntque1_int(vp, mp, true)); 2335 } 2336 2337 int 2338 insmntque1(struct vnode *vp, struct mount *mp) 2339 { 2340 return (insmntque1_int(vp, mp, false)); 2341 } 2342 2343 /* 2344 * Flush out and invalidate all buffers associated with a bufobj 2345 * Called with the underlying object locked. 
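 *
 * V_SAVE requests that dirty buffers be written out (and waited for)
 * before the invalidation; V_CLEANONLY restricts the flush to the
 * clean buffer list.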
2346 */ 2347 int 2348 bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo) 2349 { 2350 int error; 2351 2352 BO_LOCK(bo); 2353 if (flags & V_SAVE) { 2354 error = bufobj_wwait(bo, slpflag, slptimeo); 2355 if (error) { 2356 BO_UNLOCK(bo); 2357 return (error); 2358 } 2359 if (bo->bo_dirty.bv_cnt > 0) { 2360 BO_UNLOCK(bo); 2361 do { 2362 error = BO_SYNC(bo, MNT_WAIT); 2363 } while (error == ERELOOKUP); 2364 if (error != 0) 2365 return (error); 2366 BO_LOCK(bo); 2367 if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) { 2368 BO_UNLOCK(bo); 2369 return (EBUSY); 2370 } 2371 } 2372 } 2373 /* 2374 * If you alter this loop please notice that interlock is dropped and 2375 * reacquired in flushbuflist. Special care is needed to ensure that 2376 * no race conditions occur from this. 2377 */ 2378 do { 2379 error = flushbuflist(&bo->bo_clean, 2380 flags, bo, slpflag, slptimeo); 2381 if (error == 0 && !(flags & V_CLEANONLY)) 2382 error = flushbuflist(&bo->bo_dirty, 2383 flags, bo, slpflag, slptimeo); 2384 if (error != 0 && error != EAGAIN) { 2385 BO_UNLOCK(bo); 2386 return (error); 2387 } 2388 } while (error != 0); 2389 2390 /* 2391 * Wait for I/O to complete. XXX needs cleaning up. The vnode can 2392 * have write I/O in-progress but if there is a VM object then the 2393 * VM object can also have read-I/O in-progress. 2394 */ 2395 do { 2396 bufobj_wwait(bo, 0, 0); 2397 if ((flags & V_VMIO) == 0 && bo->bo_object != NULL) { 2398 BO_UNLOCK(bo); 2399 vm_object_pip_wait_unlocked(bo->bo_object, "bovlbx"); 2400 BO_LOCK(bo); 2401 } 2402 } while (bo->bo_numoutput > 0); 2403 BO_UNLOCK(bo); 2404 2405 /* 2406 * Destroy the copy in the VM cache, too. 2407 */ 2408 if (bo->bo_object != NULL && 2409 (flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0) { 2410 VM_OBJECT_WLOCK(bo->bo_object); 2411 vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ? 2412 OBJPR_CLEANONLY : 0); 2413 VM_OBJECT_WUNLOCK(bo->bo_object); 2414 } 2415 2416 #ifdef INVARIANTS 2417 BO_LOCK(bo); 2418 if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO | 2419 V_ALLOWCLEAN)) == 0 && (bo->bo_dirty.bv_cnt > 0 || 2420 bo->bo_clean.bv_cnt > 0)) 2421 panic("vinvalbuf: flush failed"); 2422 if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0 && 2423 bo->bo_dirty.bv_cnt > 0) 2424 panic("vinvalbuf: flush dirty failed"); 2425 BO_UNLOCK(bo); 2426 #endif 2427 return (0); 2428 } 2429 2430 /* 2431 * Flush out and invalidate all buffers associated with a vnode. 2432 * Called with the underlying object locked. 2433 */ 2434 int 2435 vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo) 2436 { 2437 2438 CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags); 2439 ASSERT_VOP_LOCKED(vp, "vinvalbuf"); 2440 if (vp->v_object != NULL && vp->v_object->handle != vp) 2441 return (0); 2442 return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo)); 2443 } 2444 2445 /* 2446 * Flush out buffers on the specified list. 2447 * 2448 */ 2449 static int 2450 flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag, 2451 int slptimeo) 2452 { 2453 struct buf *bp, *nbp; 2454 int retval, error; 2455 daddr_t lblkno; 2456 b_xflags_t xflags; 2457 2458 ASSERT_BO_WLOCKED(bo); 2459 2460 retval = 0; 2461 TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) { 2462 /* 2463 * If we are flushing both V_NORMAL and V_ALT buffers then 2464 * do not skip any buffers. If we are flushing only V_NORMAL 2465 * buffers then skip buffers marked as BX_ALTDATA. 
If we are 2466 * flushing only V_ALT buffers then skip buffers not marked 2467 * as BX_ALTDATA. 2468 */ 2469 if (((flags & (V_NORMAL | V_ALT)) != (V_NORMAL | V_ALT)) && 2470 (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA) != 0) || 2471 ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0))) { 2472 continue; 2473 } 2474 if (nbp != NULL) { 2475 lblkno = nbp->b_lblkno; 2476 xflags = nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN); 2477 } 2478 retval = EAGAIN; 2479 error = BUF_TIMELOCK(bp, 2480 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo), 2481 "flushbuf", slpflag, slptimeo); 2482 if (error) { 2483 BO_LOCK(bo); 2484 return (error != ENOLCK ? error : EAGAIN); 2485 } 2486 KASSERT(bp->b_bufobj == bo, 2487 ("bp %p wrong b_bufobj %p should be %p", 2488 bp, bp->b_bufobj, bo)); 2489 /* 2490 * XXX Since there are no node locks for NFS, I 2491 * believe there is a slight chance that a delayed 2492 * write will occur while sleeping just above, so 2493 * check for it. 2494 */ 2495 if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) && 2496 (flags & V_SAVE)) { 2497 bremfree(bp); 2498 bp->b_flags |= B_ASYNC; 2499 bwrite(bp); 2500 BO_LOCK(bo); 2501 return (EAGAIN); /* XXX: why not loop ? */ 2502 } 2503 bremfree(bp); 2504 bp->b_flags |= (B_INVAL | B_RELBUF); 2505 bp->b_flags &= ~B_ASYNC; 2506 brelse(bp); 2507 BO_LOCK(bo); 2508 if (nbp == NULL) 2509 break; 2510 nbp = gbincore(bo, lblkno); 2511 if (nbp == NULL || (nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) 2512 != xflags) 2513 break; /* nbp invalid */ 2514 } 2515 return (retval); 2516 } 2517 2518 int 2519 bnoreuselist(struct bufv *bufv, struct bufobj *bo, daddr_t startn, daddr_t endn) 2520 { 2521 struct buf *bp; 2522 int error; 2523 daddr_t lblkno; 2524 2525 ASSERT_BO_LOCKED(bo); 2526 2527 for (lblkno = startn;;) { 2528 again: 2529 bp = buf_lookup_ge(bufv, lblkno); 2530 if (bp == NULL || bp->b_lblkno >= endn) 2531 break; 2532 error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | 2533 LK_INTERLOCK, BO_LOCKPTR(bo), "brlsfl", 0, 0); 2534 if (error != 0) { 2535 BO_RLOCK(bo); 2536 if (error == ENOLCK) 2537 goto again; 2538 return (error); 2539 } 2540 KASSERT(bp->b_bufobj == bo, 2541 ("bp %p wrong b_bufobj %p should be %p", 2542 bp, bp->b_bufobj, bo)); 2543 lblkno = bp->b_lblkno + 1; 2544 if ((bp->b_flags & B_MANAGED) == 0) 2545 bremfree(bp); 2546 bp->b_flags |= B_RELBUF; 2547 /* 2548 * In the VMIO case, use the B_NOREUSE flag to hint that the 2549 * pages backing each buffer in the range are unlikely to be 2550 * reused. Dirty buffers will have the hint applied once 2551 * they've been written. 2552 */ 2553 if ((bp->b_flags & B_VMIO) != 0) 2554 bp->b_flags |= B_NOREUSE; 2555 brelse(bp); 2556 BO_RLOCK(bo); 2557 } 2558 return (0); 2559 } 2560 2561 /* 2562 * Truncate a file's buffer and pages to a specified length. This 2563 * is in lieu of the old vinvalbuf mechanism, which performed unneeded 2564 * sync activity. 2565 */ 2566 int 2567 vtruncbuf(struct vnode *vp, off_t length, int blksize) 2568 { 2569 struct buf *bp, *nbp; 2570 struct bufobj *bo; 2571 daddr_t startlbn; 2572 2573 CTR4(KTR_VFS, "%s: vp %p with block %d:%ju", __func__, 2574 vp, blksize, (uintmax_t)length); 2575 2576 /* 2577 * Round up to the *next* lbn. 2578 */ 2579 startlbn = howmany(length, blksize); 2580 2581 ASSERT_VOP_LOCKED(vp, "vtruncbuf"); 2582 2583 bo = &vp->v_bufobj; 2584 restart_unlocked: 2585 BO_LOCK(bo); 2586 2587 while (v_inval_buf_range_locked(vp, bo, startlbn, INT64_MAX) == EAGAIN) 2588 ; 2589 2590 if (length > 0) { 2591 /* 2592 * Write out vnode metadata, e.g. 
indirect blocks. 2593 */ 2594 restartsync: 2595 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { 2596 if (bp->b_lblkno >= 0) 2597 continue; 2598 /* 2599 * Since we hold the vnode lock this should only 2600 * fail if we're racing with the buf daemon. 2601 */ 2602 if (BUF_LOCK(bp, 2603 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, 2604 BO_LOCKPTR(bo)) == ENOLCK) 2605 goto restart_unlocked; 2606 2607 VNASSERT((bp->b_flags & B_DELWRI), vp, 2608 ("buf(%p) on dirty queue without DELWRI", bp)); 2609 2610 bremfree(bp); 2611 bawrite(bp); 2612 BO_LOCK(bo); 2613 goto restartsync; 2614 } 2615 } 2616 2617 bufobj_wwait(bo, 0, 0); 2618 BO_UNLOCK(bo); 2619 vnode_pager_setsize(vp, length); 2620 2621 return (0); 2622 } 2623 2624 /* 2625 * Invalidate the cached pages of a file's buffer within the range of block 2626 * numbers [startlbn, endlbn). 2627 */ 2628 void 2629 v_inval_buf_range(struct vnode *vp, daddr_t startlbn, daddr_t endlbn, 2630 int blksize) 2631 { 2632 struct bufobj *bo; 2633 off_t start, end; 2634 2635 ASSERT_VOP_LOCKED(vp, "v_inval_buf_range"); 2636 2637 start = blksize * startlbn; 2638 end = blksize * endlbn; 2639 2640 bo = &vp->v_bufobj; 2641 BO_LOCK(bo); 2642 MPASS(blksize == bo->bo_bsize); 2643 2644 while (v_inval_buf_range_locked(vp, bo, startlbn, endlbn) == EAGAIN) 2645 ; 2646 2647 BO_UNLOCK(bo); 2648 vn_pages_remove(vp, OFF_TO_IDX(start), OFF_TO_IDX(end + PAGE_SIZE - 1)); 2649 } 2650 2651 static int 2652 v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo, 2653 daddr_t startlbn, daddr_t endlbn) 2654 { 2655 struct bufv *bv; 2656 struct buf *bp, *nbp; 2657 uint8_t anyfreed; 2658 bool clean; 2659 2660 ASSERT_VOP_LOCKED(vp, "v_inval_buf_range_locked"); 2661 ASSERT_BO_LOCKED(bo); 2662 2663 anyfreed = 1; 2664 clean = true; 2665 do { 2666 bv = clean ? &bo->bo_clean : &bo->bo_dirty; 2667 bp = buf_lookup_ge(bv, startlbn); 2668 if (bp == NULL) 2669 continue; 2670 TAILQ_FOREACH_FROM_SAFE(bp, &bv->bv_hd, b_bobufs, nbp) { 2671 if (bp->b_lblkno >= endlbn) 2672 break; 2673 if (BUF_LOCK(bp, 2674 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, 2675 BO_LOCKPTR(bo)) == ENOLCK) { 2676 BO_LOCK(bo); 2677 return (EAGAIN); 2678 } 2679 2680 bremfree(bp); 2681 bp->b_flags |= B_INVAL | B_RELBUF; 2682 bp->b_flags &= ~B_ASYNC; 2683 brelse(bp); 2684 anyfreed = 2; 2685 2686 BO_LOCK(bo); 2687 if (nbp != NULL && 2688 (((nbp->b_xflags & 2689 (clean ? BX_VNCLEAN : BX_VNDIRTY)) == 0) || 2690 nbp->b_vp != vp || 2691 (nbp->b_flags & B_DELWRI) == (clean? B_DELWRI: 0))) 2692 return (EAGAIN); 2693 } 2694 } while (clean = !clean, anyfreed-- > 0); 2695 return (0); 2696 } 2697 2698 static void 2699 buf_vlist_remove(struct buf *bp) 2700 { 2701 struct bufv *bv; 2702 b_xflags_t flags; 2703 2704 flags = bp->b_xflags; 2705 2706 KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); 2707 ASSERT_BO_WLOCKED(bp->b_bufobj); 2708 KASSERT((flags & (BX_VNDIRTY | BX_VNCLEAN)) != 0 && 2709 (flags & (BX_VNDIRTY | BX_VNCLEAN)) != (BX_VNDIRTY | BX_VNCLEAN), 2710 ("%s: buffer %p has invalid queue state", __func__, bp)); 2711 2712 if ((flags & BX_VNDIRTY) != 0) 2713 bv = &bp->b_bufobj->bo_dirty; 2714 else 2715 bv = &bp->b_bufobj->bo_clean; 2716 BUF_PCTRIE_REMOVE(&bv->bv_root, bp->b_lblkno); 2717 TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs); 2718 bv->bv_cnt--; 2719 bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN); 2720 } 2721 2722 /* 2723 * Add the buffer to the sorted clean or dirty block list. Return zero on 2724 * success, EEXIST if a buffer with this identity already exists, or another 2725 * error on allocation failure. 
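 *
 * The caller must already have set BX_VNDIRTY or BX_VNCLEAN in
 * b_xflags to match the requested list; this is asserted below.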
2726 */ 2727 static inline int 2728 buf_vlist_find_or_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags) 2729 { 2730 struct bufv *bv; 2731 struct buf *n; 2732 int error; 2733 2734 ASSERT_BO_WLOCKED(bo); 2735 KASSERT((bo->bo_flag & BO_NOBUFS) == 0, 2736 ("buf_vlist_add: bo %p does not allow bufs", bo)); 2737 KASSERT((xflags & BX_VNDIRTY) == 0 || (bo->bo_flag & BO_DEAD) == 0, 2738 ("dead bo %p", bo)); 2739 KASSERT((bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) == xflags, 2740 ("buf_vlist_add: b_xflags %#x not set on bp %p", xflags, bp)); 2741 2742 if (xflags & BX_VNDIRTY) 2743 bv = &bo->bo_dirty; 2744 else 2745 bv = &bo->bo_clean; 2746 2747 error = buf_insert_lookup_le(bv, bp, &n); 2748 if (n == NULL) { 2749 KASSERT(error != EEXIST, 2750 ("buf_vlist_add: EEXIST but no existing buf found: bp %p", 2751 bp)); 2752 } else { 2753 KASSERT(n->b_lblkno <= bp->b_lblkno, 2754 ("buf_vlist_add: out of order insert/lookup: bp %p n %p", 2755 bp, n)); 2756 KASSERT((n->b_lblkno == bp->b_lblkno) == (error == EEXIST), 2757 ("buf_vlist_add: inconsistent result for existing buf: " 2758 "error %d bp %p n %p", error, bp, n)); 2759 } 2760 if (error != 0) 2761 return (error); 2762 2763 /* Keep the list ordered. */ 2764 if (n == NULL) { 2765 KASSERT(TAILQ_EMPTY(&bv->bv_hd) || 2766 bp->b_lblkno < TAILQ_FIRST(&bv->bv_hd)->b_lblkno, 2767 ("buf_vlist_add: queue order: " 2768 "%p should be before first %p", 2769 bp, TAILQ_FIRST(&bv->bv_hd))); 2770 TAILQ_INSERT_HEAD(&bv->bv_hd, bp, b_bobufs); 2771 } else { 2772 KASSERT(TAILQ_NEXT(n, b_bobufs) == NULL || 2773 bp->b_lblkno < TAILQ_NEXT(n, b_bobufs)->b_lblkno, 2774 ("buf_vlist_add: queue order: " 2775 "%p should be before next %p", 2776 bp, TAILQ_NEXT(n, b_bobufs))); 2777 TAILQ_INSERT_AFTER(&bv->bv_hd, n, bp, b_bobufs); 2778 } 2779 2780 bv->bv_cnt++; 2781 return (0); 2782 } 2783 2784 /* 2785 * Add the buffer to the sorted clean or dirty block list. 2786 * 2787 * NOTE: xflags is passed as a constant, optimizing this inline function! 2788 */ 2789 static void 2790 buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags) 2791 { 2792 int error; 2793 2794 KASSERT((bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) == 0, 2795 ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags)); 2796 bp->b_xflags |= xflags; 2797 error = buf_vlist_find_or_add(bp, bo, xflags); 2798 if (error) 2799 panic("buf_vlist_add: error=%d", error); 2800 } 2801 2802 /* 2803 * Look up a buffer using the buffer tries. 2804 */ 2805 struct buf * 2806 gbincore(struct bufobj *bo, daddr_t lblkno) 2807 { 2808 struct buf *bp; 2809 2810 ASSERT_BO_LOCKED(bo); 2811 bp = BUF_PCTRIE_LOOKUP(&bo->bo_clean.bv_root, lblkno); 2812 if (bp != NULL) 2813 return (bp); 2814 return (BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, lblkno)); 2815 } 2816 2817 /* 2818 * Look up a buf using the buffer tries, without the bufobj lock. This relies 2819 * on SMR for safe lookup, and bufs being in a no-free zone to provide type 2820 * stability of the result. Like other lockless lookups, the found buf may 2821 * already be invalid by the time this function returns. 2822 */ 2823 struct buf * 2824 gbincore_unlocked(struct bufobj *bo, daddr_t lblkno) 2825 { 2826 struct buf *bp; 2827 2828 ASSERT_BO_UNLOCKED(bo); 2829 bp = BUF_PCTRIE_LOOKUP_UNLOCKED(&bo->bo_clean.bv_root, lblkno); 2830 if (bp != NULL) 2831 return (bp); 2832 return (BUF_PCTRIE_LOOKUP_UNLOCKED(&bo->bo_dirty.bv_root, lblkno)); 2833 } 2834 2835 /* 2836 * Associate a buffer with a vnode. 
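 *
 * The buffer is placed on the vnode's clean list and the vnode gains a
 * hold reference. If a buffer with the same logical block number is
 * already present on either the clean or dirty list, the association
 * is undone and EEXIST is returned.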
2837 */ 2838 int 2839 bgetvp(struct vnode *vp, struct buf *bp) 2840 { 2841 struct bufobj *bo; 2842 int error; 2843 2844 bo = &vp->v_bufobj; 2845 ASSERT_BO_UNLOCKED(bo); 2846 VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free")); 2847 2848 CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags); 2849 VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp, 2850 ("bgetvp: bp already attached! %p", bp)); 2851 2852 /* 2853 * Add the buf to the vnode's clean list unless we lost a race and find 2854 * an existing buf in either dirty or clean. 2855 */ 2856 bp->b_vp = vp; 2857 bp->b_bufobj = bo; 2858 bp->b_xflags |= BX_VNCLEAN; 2859 error = EEXIST; 2860 BO_LOCK(bo); 2861 if (BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, bp->b_lblkno) == NULL) 2862 error = buf_vlist_find_or_add(bp, bo, BX_VNCLEAN); 2863 BO_UNLOCK(bo); 2864 if (__predict_true(error == 0)) { 2865 vhold(vp); 2866 return (0); 2867 } 2868 if (error != EEXIST) 2869 panic("bgetvp: buf_vlist_add error: %d", error); 2870 bp->b_vp = NULL; 2871 bp->b_bufobj = NULL; 2872 bp->b_xflags &= ~BX_VNCLEAN; 2873 return (error); 2874 } 2875 2876 /* 2877 * Disassociate a buffer from a vnode. 2878 */ 2879 void 2880 brelvp(struct buf *bp) 2881 { 2882 struct bufobj *bo; 2883 struct vnode *vp; 2884 2885 CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); 2886 KASSERT(bp->b_vp != NULL, ("brelvp: NULL")); 2887 2888 /* 2889 * Delete from old vnode list, if on one. 2890 */ 2891 vp = bp->b_vp; /* XXX */ 2892 bo = bp->b_bufobj; 2893 BO_LOCK(bo); 2894 buf_vlist_remove(bp); 2895 if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { 2896 bo->bo_flag &= ~BO_ONWORKLST; 2897 mtx_lock(&sync_mtx); 2898 LIST_REMOVE(bo, bo_synclist); 2899 syncer_worklist_len--; 2900 mtx_unlock(&sync_mtx); 2901 } 2902 bp->b_vp = NULL; 2903 bp->b_bufobj = NULL; 2904 BO_UNLOCK(bo); 2905 vdrop(vp); 2906 } 2907 2908 /* 2909 * Add an item to the syncer work queue. 
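 *
 * The delay (in seconds) is clamped to syncer_maxdelay - 2 and selects
 * a slot in the circular syncer_workitem_pending array relative to
 * syncer_delayno; a bufobj already on the worklist is simply moved to
 * its new slot.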
2910 */ 2911 static void 2912 vn_syncer_add_to_worklist(struct bufobj *bo, int delay) 2913 { 2914 int slot; 2915 2916 ASSERT_BO_WLOCKED(bo); 2917 2918 mtx_lock(&sync_mtx); 2919 if (bo->bo_flag & BO_ONWORKLST) 2920 LIST_REMOVE(bo, bo_synclist); 2921 else { 2922 bo->bo_flag |= BO_ONWORKLST; 2923 syncer_worklist_len++; 2924 } 2925 2926 if (delay > syncer_maxdelay - 2) 2927 delay = syncer_maxdelay - 2; 2928 slot = (syncer_delayno + delay) & syncer_mask; 2929 2930 LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist); 2931 mtx_unlock(&sync_mtx); 2932 } 2933 2934 static int 2935 sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS) 2936 { 2937 int error, len; 2938 2939 mtx_lock(&sync_mtx); 2940 len = syncer_worklist_len - sync_vnode_count; 2941 mtx_unlock(&sync_mtx); 2942 error = SYSCTL_OUT(req, &len, sizeof(len)); 2943 return (error); 2944 } 2945 2946 SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, 2947 CTLTYPE_INT | CTLFLAG_MPSAFE| CTLFLAG_RD, NULL, 0, 2948 sysctl_vfs_worklist_len, "I", "Syncer thread worklist length"); 2949 2950 static struct proc *updateproc; 2951 static void sched_sync(void); 2952 static struct kproc_desc up_kp = { 2953 "syncer", 2954 sched_sync, 2955 &updateproc 2956 }; 2957 SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp); 2958 2959 static int 2960 sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td) 2961 { 2962 struct vnode *vp; 2963 struct mount *mp; 2964 2965 *bo = LIST_FIRST(slp); 2966 if (*bo == NULL) 2967 return (0); 2968 vp = bo2vnode(*bo); 2969 if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0) 2970 return (1); 2971 /* 2972 * We use vhold in case the vnode does not 2973 * successfully sync. vhold prevents the vnode from 2974 * going away when we unlock the sync_mtx so that 2975 * we can acquire the vnode interlock. 2976 */ 2977 vholdl(vp); 2978 mtx_unlock(&sync_mtx); 2979 VI_UNLOCK(vp); 2980 if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { 2981 vdrop(vp); 2982 mtx_lock(&sync_mtx); 2983 return (*bo == LIST_FIRST(slp)); 2984 } 2985 MPASSERT(mp == NULL || (curthread->td_pflags & TDP_IGNSUSP) != 0 || 2986 (mp->mnt_kern_flag & MNTK_SUSPENDED) == 0, mp, 2987 ("suspended mp syncing vp %p", vp)); 2988 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 2989 (void) VOP_FSYNC(vp, MNT_LAZY, td); 2990 VOP_UNLOCK(vp); 2991 vn_finished_write(mp); 2992 BO_LOCK(*bo); 2993 if (((*bo)->bo_flag & BO_ONWORKLST) != 0) { 2994 /* 2995 * Put us back on the worklist. The worklist 2996 * routine will remove us from our current 2997 * position and then add us back in at a later 2998 * position. 2999 */ 3000 vn_syncer_add_to_worklist(*bo, syncdelay); 3001 } 3002 BO_UNLOCK(*bo); 3003 vdrop(vp); 3004 mtx_lock(&sync_mtx); 3005 return (0); 3006 } 3007 3008 static int first_printf = 1; 3009 3010 /* 3011 * System filesystem synchronizer daemon. 
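 *
 * Each iteration advances syncer_delayno by one slot in the time wheel
 * and flushes every bufobj queued there; normally one pass is made per
 * second, with rushjob allowing several back-to-back passes. During
 * shutdown the daemon steps through SYNCER_SHUTTING_DOWN and
 * SYNCER_FINAL_DELAY before suspending.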
3012 */ 3013 static void 3014 sched_sync(void) 3015 { 3016 struct synclist *next, *slp; 3017 struct bufobj *bo; 3018 long starttime; 3019 struct thread *td = curthread; 3020 int last_work_seen; 3021 int net_worklist_len; 3022 int syncer_final_iter; 3023 int error; 3024 3025 last_work_seen = 0; 3026 syncer_final_iter = 0; 3027 syncer_state = SYNCER_RUNNING; 3028 starttime = time_uptime; 3029 td->td_pflags |= TDP_NORUNNINGBUF; 3030 3031 EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc, 3032 SHUTDOWN_PRI_LAST); 3033 3034 mtx_lock(&sync_mtx); 3035 for (;;) { 3036 if (syncer_state == SYNCER_FINAL_DELAY && 3037 syncer_final_iter == 0) { 3038 mtx_unlock(&sync_mtx); 3039 kproc_suspend_check(td->td_proc); 3040 mtx_lock(&sync_mtx); 3041 } 3042 net_worklist_len = syncer_worklist_len - sync_vnode_count; 3043 if (syncer_state != SYNCER_RUNNING && 3044 starttime != time_uptime) { 3045 if (first_printf) { 3046 printf("\nSyncing disks, vnodes remaining... "); 3047 first_printf = 0; 3048 } 3049 printf("%d ", net_worklist_len); 3050 } 3051 starttime = time_uptime; 3052 3053 /* 3054 * Push files whose dirty time has expired. Be careful 3055 * of interrupt race on slp queue. 3056 * 3057 * Skip over empty worklist slots when shutting down. 3058 */ 3059 do { 3060 slp = &syncer_workitem_pending[syncer_delayno]; 3061 syncer_delayno += 1; 3062 if (syncer_delayno == syncer_maxdelay) 3063 syncer_delayno = 0; 3064 next = &syncer_workitem_pending[syncer_delayno]; 3065 /* 3066 * If the worklist has wrapped since the 3067 * it was emptied of all but syncer vnodes, 3068 * switch to the FINAL_DELAY state and run 3069 * for one more second. 3070 */ 3071 if (syncer_state == SYNCER_SHUTTING_DOWN && 3072 net_worklist_len == 0 && 3073 last_work_seen == syncer_delayno) { 3074 syncer_state = SYNCER_FINAL_DELAY; 3075 syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP; 3076 } 3077 } while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) && 3078 syncer_worklist_len > 0); 3079 3080 /* 3081 * Keep track of the last time there was anything 3082 * on the worklist other than syncer vnodes. 3083 * Return to the SHUTTING_DOWN state if any 3084 * new work appears. 3085 */ 3086 if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING) 3087 last_work_seen = syncer_delayno; 3088 if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY) 3089 syncer_state = SYNCER_SHUTTING_DOWN; 3090 while (!LIST_EMPTY(slp)) { 3091 error = sync_vnode(slp, &bo, td); 3092 if (error == 1) { 3093 LIST_REMOVE(bo, bo_synclist); 3094 LIST_INSERT_HEAD(next, bo, bo_synclist); 3095 continue; 3096 } 3097 3098 if (first_printf == 0) { 3099 /* 3100 * Drop the sync mutex, because some watchdog 3101 * drivers need to sleep while patting 3102 */ 3103 mtx_unlock(&sync_mtx); 3104 wdog_kern_pat(WD_LASTVAL); 3105 mtx_lock(&sync_mtx); 3106 } 3107 } 3108 if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0) 3109 syncer_final_iter--; 3110 /* 3111 * The variable rushjob allows the kernel to speed up the 3112 * processing of the filesystem syncer process. A rushjob 3113 * value of N tells the filesystem syncer to process the next 3114 * N seconds worth of work on its queue ASAP. Currently rushjob 3115 * is used by the soft update code to speed up the filesystem 3116 * syncer process when the incore state is getting so far 3117 * ahead of the disk that the kernel memory pool is being 3118 * threatened with exhaustion. 
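 *
 * rushjob is bumped via speedup_syncer() below, which refuses to raise
 * it beyond syncdelay / 2 so the syncer cannot monopolize the CPU.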
3119 */ 3120 if (rushjob > 0) { 3121 rushjob -= 1; 3122 continue; 3123 } 3124 /* 3125 * Just sleep for a short period of time between 3126 * iterations when shutting down to allow some I/O 3127 * to happen. 3128 * 3129 * If it has taken us less than a second to process the 3130 * current work, then wait. Otherwise start right over 3131 * again. We can still lose time if any single round 3132 * takes more than two seconds, but it does not really 3133 * matter as we are just trying to generally pace the 3134 * filesystem activity. 3135 */ 3136 if (syncer_state != SYNCER_RUNNING || 3137 time_uptime == starttime) { 3138 thread_lock(td); 3139 sched_prio(td, PPAUSE); 3140 thread_unlock(td); 3141 } 3142 if (syncer_state != SYNCER_RUNNING) 3143 cv_timedwait(&sync_wakeup, &sync_mtx, 3144 hz / SYNCER_SHUTDOWN_SPEEDUP); 3145 else if (time_uptime == starttime) 3146 cv_timedwait(&sync_wakeup, &sync_mtx, hz); 3147 } 3148 } 3149 3150 /* 3151 * Request the syncer daemon to speed up its work. 3152 * We never push it to speed up more than half of its 3153 * normal turn time, otherwise it could take over the cpu. 3154 */ 3155 int 3156 speedup_syncer(void) 3157 { 3158 int ret = 0; 3159 3160 mtx_lock(&sync_mtx); 3161 if (rushjob < syncdelay / 2) { 3162 rushjob += 1; 3163 stat_rush_requests += 1; 3164 ret = 1; 3165 } 3166 mtx_unlock(&sync_mtx); 3167 cv_broadcast(&sync_wakeup); 3168 return (ret); 3169 } 3170 3171 /* 3172 * Tell the syncer to speed up its work and run though its work 3173 * list several times, then tell it to shut down. 3174 */ 3175 static void 3176 syncer_shutdown(void *arg, int howto) 3177 { 3178 3179 if (howto & RB_NOSYNC) 3180 return; 3181 mtx_lock(&sync_mtx); 3182 syncer_state = SYNCER_SHUTTING_DOWN; 3183 rushjob = 0; 3184 mtx_unlock(&sync_mtx); 3185 cv_broadcast(&sync_wakeup); 3186 kproc_shutdown(arg, howto); 3187 } 3188 3189 void 3190 syncer_suspend(void) 3191 { 3192 3193 syncer_shutdown(updateproc, 0); 3194 } 3195 3196 void 3197 syncer_resume(void) 3198 { 3199 3200 mtx_lock(&sync_mtx); 3201 first_printf = 1; 3202 syncer_state = SYNCER_RUNNING; 3203 mtx_unlock(&sync_mtx); 3204 cv_broadcast(&sync_wakeup); 3205 kproc_resume(updateproc); 3206 } 3207 3208 /* 3209 * Move the buffer between the clean and dirty lists of its vnode. 3210 */ 3211 void 3212 reassignbuf(struct buf *bp) 3213 { 3214 struct vnode *vp; 3215 struct bufobj *bo; 3216 int delay; 3217 #ifdef INVARIANTS 3218 struct bufv *bv; 3219 #endif 3220 3221 vp = bp->b_vp; 3222 bo = bp->b_bufobj; 3223 3224 KASSERT((bp->b_flags & B_PAGING) == 0, 3225 ("%s: cannot reassign paging buffer %p", __func__, bp)); 3226 3227 CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X", 3228 bp, bp->b_vp, bp->b_flags); 3229 3230 BO_LOCK(bo); 3231 if ((bo->bo_flag & BO_NONSTERILE) == 0) { 3232 /* 3233 * Coordinate with getblk's unlocked lookup. Make 3234 * BO_NONSTERILE visible before the first reassignbuf produces 3235 * any side effect. This could be outside the bo lock if we 3236 * used a separate atomic flag field. 3237 */ 3238 bo->bo_flag |= BO_NONSTERILE; 3239 atomic_thread_fence_rel(); 3240 } 3241 buf_vlist_remove(bp); 3242 3243 /* 3244 * If dirty, put on list of dirty buffers; otherwise insert onto list 3245 * of clean buffers. 
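 *
 * A buffer becoming dirty also puts the bufobj on the syncer worklist
 * (if it is not there yet), with a delay chosen by vnode type (dirdelay
 * for directories, metadelay for character devices, filedelay
 * otherwise); a buffer going clean with no dirty buffers left takes the
 * bufobj back off the list.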
3246 */ 3247 if (bp->b_flags & B_DELWRI) { 3248 if ((bo->bo_flag & BO_ONWORKLST) == 0) { 3249 switch (vp->v_type) { 3250 case VDIR: 3251 delay = dirdelay; 3252 break; 3253 case VCHR: 3254 delay = metadelay; 3255 break; 3256 default: 3257 delay = filedelay; 3258 } 3259 vn_syncer_add_to_worklist(bo, delay); 3260 } 3261 buf_vlist_add(bp, bo, BX_VNDIRTY); 3262 } else { 3263 buf_vlist_add(bp, bo, BX_VNCLEAN); 3264 3265 if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { 3266 mtx_lock(&sync_mtx); 3267 LIST_REMOVE(bo, bo_synclist); 3268 syncer_worklist_len--; 3269 mtx_unlock(&sync_mtx); 3270 bo->bo_flag &= ~BO_ONWORKLST; 3271 } 3272 } 3273 #ifdef INVARIANTS 3274 bv = &bo->bo_clean; 3275 bp = TAILQ_FIRST(&bv->bv_hd); 3276 KASSERT(bp == NULL || bp->b_bufobj == bo, 3277 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 3278 bp = TAILQ_LAST(&bv->bv_hd, buflists); 3279 KASSERT(bp == NULL || bp->b_bufobj == bo, 3280 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 3281 bv = &bo->bo_dirty; 3282 bp = TAILQ_FIRST(&bv->bv_hd); 3283 KASSERT(bp == NULL || bp->b_bufobj == bo, 3284 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 3285 bp = TAILQ_LAST(&bv->bv_hd, buflists); 3286 KASSERT(bp == NULL || bp->b_bufobj == bo, 3287 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 3288 #endif 3289 BO_UNLOCK(bo); 3290 } 3291 3292 static void 3293 v_init_counters(struct vnode *vp) 3294 { 3295 3296 VNASSERT(vp->v_type == VNON && vp->v_data == NULL && vp->v_iflag == 0, 3297 vp, ("%s called for an initialized vnode", __FUNCTION__)); 3298 ASSERT_VI_UNLOCKED(vp, __FUNCTION__); 3299 3300 refcount_init(&vp->v_holdcnt, 1); 3301 refcount_init(&vp->v_usecount, 1); 3302 } 3303 3304 /* 3305 * Get a usecount on a vnode. 3306 * 3307 * vget and vget_finish may fail to lock the vnode if they lose a race against 3308 * it being doomed. LK_RETRY can be passed in flags to lock it anyway. 3309 * 3310 * Consumers which don't guarantee liveness of the vnode can use SMR to 3311 * try to get a reference. Note this operation can fail since the vnode 3312 * may be awaiting getting freed by the time they get to it. 
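 *
 * For illustration only (error handling and the lookup itself elided,
 * placeholders in angle brackets), a lockless consumer typically does
 * something along the lines of:
 *
 *	vfs_smr_enter();
 *	vp = <find the vnode in an SMR-protected structure>;
 *	vs = vget_prep_smr(vp);
 *	vfs_smr_exit();
 *	if (vs == VGET_NONE)
 *		<fall back to a locked lookup>;
 *	error = vget_finish(vp, LK_SHARED, vs);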
3313 */ 3314 enum vgetstate 3315 vget_prep_smr(struct vnode *vp) 3316 { 3317 enum vgetstate vs; 3318 3319 VFS_SMR_ASSERT_ENTERED(); 3320 3321 if (refcount_acquire_if_not_zero(&vp->v_usecount)) { 3322 vs = VGET_USECOUNT; 3323 } else { 3324 if (vhold_smr(vp)) 3325 vs = VGET_HOLDCNT; 3326 else 3327 vs = VGET_NONE; 3328 } 3329 return (vs); 3330 } 3331 3332 enum vgetstate 3333 vget_prep(struct vnode *vp) 3334 { 3335 enum vgetstate vs; 3336 3337 if (refcount_acquire_if_not_zero(&vp->v_usecount)) { 3338 vs = VGET_USECOUNT; 3339 } else { 3340 vhold(vp); 3341 vs = VGET_HOLDCNT; 3342 } 3343 return (vs); 3344 } 3345 3346 void 3347 vget_abort(struct vnode *vp, enum vgetstate vs) 3348 { 3349 3350 switch (vs) { 3351 case VGET_USECOUNT: 3352 vrele(vp); 3353 break; 3354 case VGET_HOLDCNT: 3355 vdrop(vp); 3356 break; 3357 default: 3358 __assert_unreachable(); 3359 } 3360 } 3361 3362 int 3363 vget(struct vnode *vp, int flags) 3364 { 3365 enum vgetstate vs; 3366 3367 vs = vget_prep(vp); 3368 return (vget_finish(vp, flags, vs)); 3369 } 3370 3371 int 3372 vget_finish(struct vnode *vp, int flags, enum vgetstate vs) 3373 { 3374 int error; 3375 3376 if ((flags & LK_INTERLOCK) != 0) 3377 ASSERT_VI_LOCKED(vp, __func__); 3378 else 3379 ASSERT_VI_UNLOCKED(vp, __func__); 3380 VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp); 3381 VNPASS(vp->v_holdcnt > 0, vp); 3382 VNPASS(vs == VGET_HOLDCNT || vp->v_usecount > 0, vp); 3383 3384 error = vn_lock(vp, flags); 3385 if (__predict_false(error != 0)) { 3386 vget_abort(vp, vs); 3387 CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__, 3388 vp); 3389 return (error); 3390 } 3391 3392 vget_finish_ref(vp, vs); 3393 return (0); 3394 } 3395 3396 void 3397 vget_finish_ref(struct vnode *vp, enum vgetstate vs) 3398 { 3399 int old; 3400 3401 VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp); 3402 VNPASS(vp->v_holdcnt > 0, vp); 3403 VNPASS(vs == VGET_HOLDCNT || vp->v_usecount > 0, vp); 3404 3405 if (vs == VGET_USECOUNT) 3406 return; 3407 3408 /* 3409 * We hold the vnode. If the usecount is 0 it will be utilized to keep 3410 * the vnode around. Otherwise someone else lended their hold count and 3411 * we have to drop ours. 3412 */ 3413 old = atomic_fetchadd_int(&vp->v_usecount, 1); 3414 VNASSERT(old >= 0, vp, ("%s: wrong use count %d", __func__, old)); 3415 if (old != 0) { 3416 #ifdef INVARIANTS 3417 old = atomic_fetchadd_int(&vp->v_holdcnt, -1); 3418 VNASSERT(old > 1, vp, ("%s: wrong hold count %d", __func__, old)); 3419 #else 3420 refcount_release(&vp->v_holdcnt); 3421 #endif 3422 } 3423 } 3424 3425 void 3426 vref(struct vnode *vp) 3427 { 3428 enum vgetstate vs; 3429 3430 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3431 vs = vget_prep(vp); 3432 vget_finish_ref(vp, vs); 3433 } 3434 3435 void 3436 vrefact(struct vnode *vp) 3437 { 3438 int old __diagused; 3439 3440 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3441 old = refcount_acquire(&vp->v_usecount); 3442 VNASSERT(old > 0, vp, ("%s: wrong use count %d", __func__, old)); 3443 } 3444 3445 void 3446 vlazy(struct vnode *vp) 3447 { 3448 struct mount *mp; 3449 3450 VNASSERT(vp->v_holdcnt > 0, vp, ("%s: vnode not held", __func__)); 3451 3452 if ((vp->v_mflag & VMP_LAZYLIST) != 0) 3453 return; 3454 /* 3455 * We may get here for inactive routines after the vnode got doomed. 
3456 */ 3457 if (VN_IS_DOOMED(vp)) 3458 return; 3459 mp = vp->v_mount; 3460 mtx_lock(&mp->mnt_listmtx); 3461 if ((vp->v_mflag & VMP_LAZYLIST) == 0) { 3462 vp->v_mflag |= VMP_LAZYLIST; 3463 TAILQ_INSERT_TAIL(&mp->mnt_lazyvnodelist, vp, v_lazylist); 3464 mp->mnt_lazyvnodelistsize++; 3465 } 3466 mtx_unlock(&mp->mnt_listmtx); 3467 } 3468 3469 static void 3470 vunlazy(struct vnode *vp) 3471 { 3472 struct mount *mp; 3473 3474 ASSERT_VI_LOCKED(vp, __func__); 3475 VNPASS(!VN_IS_DOOMED(vp), vp); 3476 3477 mp = vp->v_mount; 3478 mtx_lock(&mp->mnt_listmtx); 3479 VNPASS(vp->v_mflag & VMP_LAZYLIST, vp); 3480 /* 3481 * Don't remove the vnode from the lazy list if another thread 3482 * has increased the hold count. It may have re-enqueued the 3483 * vnode to the lazy list and is now responsible for its 3484 * removal. 3485 */ 3486 if (vp->v_holdcnt == 0) { 3487 vp->v_mflag &= ~VMP_LAZYLIST; 3488 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, vp, v_lazylist); 3489 mp->mnt_lazyvnodelistsize--; 3490 } 3491 mtx_unlock(&mp->mnt_listmtx); 3492 } 3493 3494 /* 3495 * This routine is only meant to be called from vgonel prior to dooming 3496 * the vnode. 3497 */ 3498 static void 3499 vunlazy_gone(struct vnode *vp) 3500 { 3501 struct mount *mp; 3502 3503 ASSERT_VOP_ELOCKED(vp, __func__); 3504 ASSERT_VI_LOCKED(vp, __func__); 3505 VNPASS(!VN_IS_DOOMED(vp), vp); 3506 3507 if (vp->v_mflag & VMP_LAZYLIST) { 3508 mp = vp->v_mount; 3509 mtx_lock(&mp->mnt_listmtx); 3510 VNPASS(vp->v_mflag & VMP_LAZYLIST, vp); 3511 vp->v_mflag &= ~VMP_LAZYLIST; 3512 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, vp, v_lazylist); 3513 mp->mnt_lazyvnodelistsize--; 3514 mtx_unlock(&mp->mnt_listmtx); 3515 } 3516 } 3517 3518 static void 3519 vdefer_inactive(struct vnode *vp) 3520 { 3521 3522 ASSERT_VI_LOCKED(vp, __func__); 3523 VNPASS(vp->v_holdcnt > 0, vp); 3524 if (VN_IS_DOOMED(vp)) { 3525 vdropl(vp); 3526 return; 3527 } 3528 if (vp->v_iflag & VI_DEFINACT) { 3529 VNPASS(vp->v_holdcnt > 1, vp); 3530 vdropl(vp); 3531 return; 3532 } 3533 if (vp->v_usecount > 0) { 3534 vp->v_iflag &= ~VI_OWEINACT; 3535 vdropl(vp); 3536 return; 3537 } 3538 vlazy(vp); 3539 vp->v_iflag |= VI_DEFINACT; 3540 VI_UNLOCK(vp); 3541 atomic_add_long(&deferred_inact, 1); 3542 } 3543 3544 static void 3545 vdefer_inactive_unlocked(struct vnode *vp) 3546 { 3547 3548 VI_LOCK(vp); 3549 if ((vp->v_iflag & VI_OWEINACT) == 0) { 3550 vdropl(vp); 3551 return; 3552 } 3553 vdefer_inactive(vp); 3554 } 3555 3556 enum vput_op { VRELE, VPUT, VUNREF }; 3557 3558 /* 3559 * Handle ->v_usecount transitioning to 0. 3560 * 3561 * By releasing the last usecount we take ownership of the hold count which 3562 * provides liveness of the vnode, meaning we have to vdrop. 3563 * 3564 * For all vnodes we may need to perform inactive processing. It requires an 3565 * exclusive lock on the vnode, while it is legal to call here with only a 3566 * shared lock (or no locks). If locking the vnode in an expected manner fails, 3567 * inactive processing gets deferred to the syncer. 3568 * 3569 * XXX Some filesystems pass in an exclusively locked vnode and strongly depend 3570 * on the lock being held all the way until VOP_INACTIVE. This in particular 3571 * happens with UFS which adds half-constructed vnodes to the hash, where they 3572 * can be found by other code. 
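 *
 * Lock state expected from the three callers: vrele() enters with the
 * vnode in any lock state and leaves it unchanged, vput() enters locked
 * and unlocks, vunref() enters and leaves locked.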
3573 */ 3574 static void 3575 vput_final(struct vnode *vp, enum vput_op func) 3576 { 3577 int error; 3578 bool want_unlock; 3579 3580 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3581 VNPASS(vp->v_holdcnt > 0, vp); 3582 3583 VI_LOCK(vp); 3584 3585 /* 3586 * By the time we got here someone else might have transitioned 3587 * the count back to > 0. 3588 */ 3589 if (vp->v_usecount > 0) 3590 goto out; 3591 3592 /* 3593 * If the vnode is doomed vgone already performed inactive processing 3594 * (if needed). 3595 */ 3596 if (VN_IS_DOOMED(vp)) 3597 goto out; 3598 3599 if (__predict_true(VOP_NEED_INACTIVE(vp) == 0)) 3600 goto out; 3601 3602 if (vp->v_iflag & VI_DOINGINACT) 3603 goto out; 3604 3605 /* 3606 * Locking operations here will drop the interlock and possibly the 3607 * vnode lock, opening a window where the vnode can get doomed all the 3608 * while ->v_usecount is 0. Set VI_OWEINACT to let vgone know to 3609 * perform inactive. 3610 */ 3611 vp->v_iflag |= VI_OWEINACT; 3612 want_unlock = false; 3613 error = 0; 3614 switch (func) { 3615 case VRELE: 3616 switch (VOP_ISLOCKED(vp)) { 3617 case LK_EXCLUSIVE: 3618 break; 3619 case LK_EXCLOTHER: 3620 case 0: 3621 want_unlock = true; 3622 error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK); 3623 VI_LOCK(vp); 3624 break; 3625 default: 3626 /* 3627 * The lock has at least one sharer, but we have no way 3628 * to conclude whether this is us. Play it safe and 3629 * defer processing. 3630 */ 3631 error = EAGAIN; 3632 break; 3633 } 3634 break; 3635 case VPUT: 3636 want_unlock = true; 3637 if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { 3638 error = VOP_LOCK(vp, LK_UPGRADE | LK_INTERLOCK | 3639 LK_NOWAIT); 3640 VI_LOCK(vp); 3641 } 3642 break; 3643 case VUNREF: 3644 if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { 3645 error = VOP_LOCK(vp, LK_TRYUPGRADE | LK_INTERLOCK); 3646 VI_LOCK(vp); 3647 } 3648 break; 3649 } 3650 if (error == 0) { 3651 if (func == VUNREF) { 3652 VNASSERT((vp->v_vflag & VV_UNREF) == 0, vp, 3653 ("recursive vunref")); 3654 vp->v_vflag |= VV_UNREF; 3655 } 3656 for (;;) { 3657 error = vinactive(vp); 3658 if (want_unlock) 3659 VOP_UNLOCK(vp); 3660 if (error != ERELOOKUP || !want_unlock) 3661 break; 3662 VOP_LOCK(vp, LK_EXCLUSIVE); 3663 } 3664 if (func == VUNREF) 3665 vp->v_vflag &= ~VV_UNREF; 3666 vdropl(vp); 3667 } else { 3668 vdefer_inactive(vp); 3669 } 3670 return; 3671 out: 3672 if (func == VPUT) 3673 VOP_UNLOCK(vp); 3674 vdropl(vp); 3675 } 3676 3677 /* 3678 * Decrement ->v_usecount for a vnode. 3679 * 3680 * Releasing the last use count requires additional processing, see vput_final 3681 * above for details. 3682 * 3683 * Comment above each variant denotes lock state on entry and exit. 
3684 */ 3685 3686 /* 3687 * in: any 3688 * out: same as passed in 3689 */ 3690 void 3691 vrele(struct vnode *vp) 3692 { 3693 3694 ASSERT_VI_UNLOCKED(vp, __func__); 3695 if (!refcount_release(&vp->v_usecount)) 3696 return; 3697 vput_final(vp, VRELE); 3698 } 3699 3700 /* 3701 * in: locked 3702 * out: unlocked 3703 */ 3704 void 3705 vput(struct vnode *vp) 3706 { 3707 3708 ASSERT_VOP_LOCKED(vp, __func__); 3709 ASSERT_VI_UNLOCKED(vp, __func__); 3710 if (!refcount_release(&vp->v_usecount)) { 3711 VOP_UNLOCK(vp); 3712 return; 3713 } 3714 vput_final(vp, VPUT); 3715 } 3716 3717 /* 3718 * in: locked 3719 * out: locked 3720 */ 3721 void 3722 vunref(struct vnode *vp) 3723 { 3724 3725 ASSERT_VOP_LOCKED(vp, __func__); 3726 ASSERT_VI_UNLOCKED(vp, __func__); 3727 if (!refcount_release(&vp->v_usecount)) 3728 return; 3729 vput_final(vp, VUNREF); 3730 } 3731 3732 void 3733 vhold(struct vnode *vp) 3734 { 3735 int old; 3736 3737 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3738 old = atomic_fetchadd_int(&vp->v_holdcnt, 1); 3739 VNASSERT(old >= 0 && (old & VHOLD_ALL_FLAGS) == 0, vp, 3740 ("%s: wrong hold count %d", __func__, old)); 3741 if (old == 0) 3742 vfs_freevnodes_dec(); 3743 } 3744 3745 void 3746 vholdnz(struct vnode *vp) 3747 { 3748 3749 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3750 #ifdef INVARIANTS 3751 int old = atomic_fetchadd_int(&vp->v_holdcnt, 1); 3752 VNASSERT(old > 0 && (old & VHOLD_ALL_FLAGS) == 0, vp, 3753 ("%s: wrong hold count %d", __func__, old)); 3754 #else 3755 atomic_add_int(&vp->v_holdcnt, 1); 3756 #endif 3757 } 3758 3759 /* 3760 * Grab a hold count unless the vnode is freed. 3761 * 3762 * Only use this routine if vfs smr is the only protection you have against 3763 * freeing the vnode. 3764 * 3765 * The code loops trying to add a hold count as long as the VHOLD_NO_SMR flag 3766 * is not set. After the flag is set the vnode becomes immutable to anyone but 3767 * the thread which managed to set the flag. 3768 * 3769 * It may be tempting to replace the loop with: 3770 * count = atomic_fetchadd_int(&vp->v_holdcnt, 1); 3771 * if (count & VHOLD_NO_SMR) { 3772 * backpedal and error out; 3773 * } 3774 * 3775 * However, while this is more performant, it hinders debugging by eliminating 3776 * the previously mentioned invariant. 3777 */ 3778 bool 3779 vhold_smr(struct vnode *vp) 3780 { 3781 int count; 3782 3783 VFS_SMR_ASSERT_ENTERED(); 3784 3785 count = atomic_load_int(&vp->v_holdcnt); 3786 for (;;) { 3787 if (count & VHOLD_NO_SMR) { 3788 VNASSERT((count & ~VHOLD_NO_SMR) == 0, vp, 3789 ("non-zero hold count with flags %d\n", count)); 3790 return (false); 3791 } 3792 VNASSERT(count >= 0, vp, ("invalid hold count %d\n", count)); 3793 if (atomic_fcmpset_int(&vp->v_holdcnt, &count, count + 1)) { 3794 if (count == 0) 3795 vfs_freevnodes_dec(); 3796 return (true); 3797 } 3798 } 3799 } 3800 3801 /* 3802 * Hold a free vnode for recycling. 3803 * 3804 * Note: vnode_init references this comment. 3805 * 3806 * Attempts to recycle only need the global vnode list lock and have no use for 3807 * SMR. 3808 * 3809 * However, vnodes get inserted into the global list before they get fully 3810 * initialized and stay there until UMA decides to free the memory. This in 3811 * particular means the target can be found before it becomes usable and after 3812 * it becomes recycled. Picking up such vnodes is guarded with v_holdcnt set to 3813 * VHOLD_NO_SMR. 3814 * 3815 * Note: the vnode may gain more references after we transition the count 0->1. 
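 *
 * Callers which go on to recycle cope with that by re-checking
 * v_usecount under the interlock (see vtryrecycle()) and backing off if
 * the vnode got referenced in the meantime.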
3816 */ 3817 static bool 3818 vhold_recycle_free(struct vnode *vp) 3819 { 3820 int count; 3821 3822 mtx_assert(&vnode_list_mtx, MA_OWNED); 3823 3824 count = atomic_load_int(&vp->v_holdcnt); 3825 for (;;) { 3826 if (count & VHOLD_NO_SMR) { 3827 VNASSERT((count & ~VHOLD_NO_SMR) == 0, vp, 3828 ("non-zero hold count with flags %d\n", count)); 3829 return (false); 3830 } 3831 VNASSERT(count >= 0, vp, ("invalid hold count %d\n", count)); 3832 if (count > 0) { 3833 return (false); 3834 } 3835 if (atomic_fcmpset_int(&vp->v_holdcnt, &count, count + 1)) { 3836 vfs_freevnodes_dec(); 3837 return (true); 3838 } 3839 } 3840 } 3841 3842 static void __noinline 3843 vdbatch_process(struct vdbatch *vd) 3844 { 3845 struct vnode *vp; 3846 int i; 3847 3848 mtx_assert(&vd->lock, MA_OWNED); 3849 MPASS(curthread->td_pinned > 0); 3850 MPASS(vd->index == VDBATCH_SIZE); 3851 3852 /* 3853 * Attempt to requeue the passed batch, but give up easily. 3854 * 3855 * Despite batching the mechanism is prone to transient *significant* 3856 * lock contention, where vnode_list_mtx becomes the primary bottleneck 3857 * if multiple CPUs get here (one real-world example is highly parallel 3858 * do-nothing make , which will stat *tons* of vnodes). Since it is 3859 * quasi-LRU (read: not that great even if fully honoured) provide an 3860 * option to just dodge the problem. Parties which don't like it are 3861 * welcome to implement something better. 3862 */ 3863 if (vnode_can_skip_requeue) { 3864 if (!mtx_trylock(&vnode_list_mtx)) { 3865 counter_u64_add(vnode_skipped_requeues, 1); 3866 critical_enter(); 3867 for (i = 0; i < VDBATCH_SIZE; i++) { 3868 vp = vd->tab[i]; 3869 vd->tab[i] = NULL; 3870 MPASS(vp->v_dbatchcpu != NOCPU); 3871 vp->v_dbatchcpu = NOCPU; 3872 } 3873 vd->index = 0; 3874 critical_exit(); 3875 return; 3876 3877 } 3878 /* fallthrough to locked processing */ 3879 } else { 3880 mtx_lock(&vnode_list_mtx); 3881 } 3882 3883 mtx_assert(&vnode_list_mtx, MA_OWNED); 3884 critical_enter(); 3885 for (i = 0; i < VDBATCH_SIZE; i++) { 3886 vp = vd->tab[i]; 3887 vd->tab[i] = NULL; 3888 TAILQ_REMOVE(&vnode_list, vp, v_vnodelist); 3889 TAILQ_INSERT_TAIL(&vnode_list, vp, v_vnodelist); 3890 MPASS(vp->v_dbatchcpu != NOCPU); 3891 vp->v_dbatchcpu = NOCPU; 3892 } 3893 mtx_unlock(&vnode_list_mtx); 3894 vd->index = 0; 3895 critical_exit(); 3896 } 3897 3898 static void 3899 vdbatch_enqueue(struct vnode *vp) 3900 { 3901 struct vdbatch *vd; 3902 3903 ASSERT_VI_LOCKED(vp, __func__); 3904 VNPASS(!VN_IS_DOOMED(vp), vp); 3905 3906 if (vp->v_dbatchcpu != NOCPU) { 3907 VI_UNLOCK(vp); 3908 return; 3909 } 3910 3911 sched_pin(); 3912 vd = DPCPU_PTR(vd); 3913 mtx_lock(&vd->lock); 3914 MPASS(vd->index < VDBATCH_SIZE); 3915 MPASS(vd->tab[vd->index] == NULL); 3916 /* 3917 * A hack: we depend on being pinned so that we know what to put in 3918 * ->v_dbatchcpu. 3919 */ 3920 vp->v_dbatchcpu = curcpu; 3921 vd->tab[vd->index] = vp; 3922 vd->index++; 3923 VI_UNLOCK(vp); 3924 if (vd->index == VDBATCH_SIZE) 3925 vdbatch_process(vd); 3926 mtx_unlock(&vd->lock); 3927 sched_unpin(); 3928 } 3929 3930 /* 3931 * This routine must only be called for vnodes which are about to be 3932 * deallocated. Supporting dequeue for arbitrary vndoes would require 3933 * validating that the locked batch matches. 
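 *
 * As it is, v_dbatchcpu records which per-CPU batch to lock, and the
 * search below tolerates the vnode having already been flushed out of
 * that batch by its owning CPU.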
 */
static void
vdbatch_dequeue(struct vnode *vp)
{
	struct vdbatch *vd;
	int i;
	short cpu;

	VNPASS(vp->v_type == VBAD || vp->v_type == VNON, vp);

	cpu = vp->v_dbatchcpu;
	if (cpu == NOCPU)
		return;

	vd = DPCPU_ID_PTR(cpu, vd);
	mtx_lock(&vd->lock);
	for (i = 0; i < vd->index; i++) {
		if (vd->tab[i] != vp)
			continue;
		vp->v_dbatchcpu = NOCPU;
		vd->index--;
		vd->tab[i] = vd->tab[vd->index];
		vd->tab[vd->index] = NULL;
		break;
	}
	mtx_unlock(&vd->lock);
	/*
	 * Either we dequeued the vnode above or the target CPU beat us to it.
	 */
	MPASS(vp->v_dbatchcpu == NOCPU);
}

/*
 * Drop the hold count of the vnode.
 *
 * It will only get freed if this is the last hold *and* it has been vgone'd.
 *
 * Because the vnode vm object keeps a hold reference on the vnode if
 * there is at least one resident non-cached page, the vnode cannot
 * leave the active list without the page cleanup done.
 */
static void __noinline
vdropl_final(struct vnode *vp)
{

	ASSERT_VI_LOCKED(vp, __func__);
	VNPASS(VN_IS_DOOMED(vp), vp);
	/*
	 * Set the VHOLD_NO_SMR flag.
	 *
	 * We may be racing against vhold_smr. If they win we can just pretend
	 * we never got this far, they will vdrop later.
	 */
	if (__predict_false(!atomic_cmpset_int(&vp->v_holdcnt, 0, VHOLD_NO_SMR))) {
		vfs_freevnodes_inc();
		VI_UNLOCK(vp);
		/*
		 * We lost the aforementioned race. Any subsequent access is
		 * invalid as they might have managed to vdropl on their own.
		 */
		return;
	}
	/*
	 * Don't bump freevnodes as this one is going away.
	 */
	freevnode(vp);
}

void
vdrop(struct vnode *vp)
{

	ASSERT_VI_UNLOCKED(vp, __func__);
	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
	if (refcount_release_if_not_last(&vp->v_holdcnt))
		return;
	VI_LOCK(vp);
	vdropl(vp);
}

static __always_inline void
vdropl_impl(struct vnode *vp, bool enqueue)
{

	ASSERT_VI_LOCKED(vp, __func__);
	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
	if (!refcount_release(&vp->v_holdcnt)) {
		VI_UNLOCK(vp);
		return;
	}
	VNPASS((vp->v_iflag & VI_OWEINACT) == 0, vp);
	VNPASS((vp->v_iflag & VI_DEFINACT) == 0, vp);
	if (VN_IS_DOOMED(vp)) {
		vdropl_final(vp);
		return;
	}

	vfs_freevnodes_inc();
	if (vp->v_mflag & VMP_LAZYLIST) {
		vunlazy(vp);
	}

	if (!enqueue) {
		VI_UNLOCK(vp);
		return;
	}

	/*
	 * Also unlocks the interlock. We can't assert on it as we
	 * released our hold and by now the vnode might have been
	 * freed.
	 */
	vdbatch_enqueue(vp);
}

void
vdropl(struct vnode *vp)
{

	vdropl_impl(vp, true);
}

/*
 * vdrop a vnode when recycling
 *
 * This is a special case routine only to be used when recycling, differs from
 * regular vdrop by not requeueing the vnode on the LRU list.
 *
 * Consider a case where vtryrecycle continuously fails with all vnodes (e.g.,
 * due to frozen writes on the filesystem), filling the batch and causing it to
 * be requeued. Then vnlru will end up revisiting the same vnodes. This is a
 * loop which can last for as long as writes are frozen.
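 *
 * In short (illustrative):
 *	vdropl(vp)         is vdropl_impl(vp, true):  may requeue via vdbatch
 *	vdropl_recycle(vp) is vdropl_impl(vp, false): never requeues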
4066 */ 4067 static void 4068 vdropl_recycle(struct vnode *vp) 4069 { 4070 4071 vdropl_impl(vp, false); 4072 } 4073 4074 static void 4075 vdrop_recycle(struct vnode *vp) 4076 { 4077 4078 VI_LOCK(vp); 4079 vdropl_recycle(vp); 4080 } 4081 4082 /* 4083 * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT 4084 * flags. DOINGINACT prevents us from recursing in calls to vinactive. 4085 */ 4086 static int 4087 vinactivef(struct vnode *vp) 4088 { 4089 int error; 4090 4091 ASSERT_VOP_ELOCKED(vp, "vinactive"); 4092 ASSERT_VI_LOCKED(vp, "vinactive"); 4093 VNPASS((vp->v_iflag & VI_DOINGINACT) == 0, vp); 4094 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 4095 vp->v_iflag |= VI_DOINGINACT; 4096 vp->v_iflag &= ~VI_OWEINACT; 4097 VI_UNLOCK(vp); 4098 4099 /* 4100 * Before moving off the active list, we must be sure that any 4101 * modified pages are converted into the vnode's dirty 4102 * buffers, since these will no longer be checked once the 4103 * vnode is on the inactive list. 4104 * 4105 * The write-out of the dirty pages is asynchronous. At the 4106 * point that VOP_INACTIVE() is called, there could still be 4107 * pending I/O and dirty pages in the object. 4108 */ 4109 if ((vp->v_vflag & VV_NOSYNC) == 0) 4110 vnode_pager_clean_async(vp); 4111 4112 error = VOP_INACTIVE(vp); 4113 VI_LOCK(vp); 4114 VNPASS(vp->v_iflag & VI_DOINGINACT, vp); 4115 vp->v_iflag &= ~VI_DOINGINACT; 4116 return (error); 4117 } 4118 4119 int 4120 vinactive(struct vnode *vp) 4121 { 4122 4123 ASSERT_VOP_ELOCKED(vp, "vinactive"); 4124 ASSERT_VI_LOCKED(vp, "vinactive"); 4125 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 4126 4127 if ((vp->v_iflag & VI_OWEINACT) == 0) 4128 return (0); 4129 if (vp->v_iflag & VI_DOINGINACT) 4130 return (0); 4131 if (vp->v_usecount > 0) { 4132 vp->v_iflag &= ~VI_OWEINACT; 4133 return (0); 4134 } 4135 return (vinactivef(vp)); 4136 } 4137 4138 /* 4139 * Remove any vnodes in the vnode table belonging to mount point mp. 4140 * 4141 * If FORCECLOSE is not specified, there should not be any active ones, 4142 * return error if any are found (nb: this is a user error, not a 4143 * system error). If FORCECLOSE is specified, detach any active vnodes 4144 * that are found. 4145 * 4146 * If WRITECLOSE is set, only flush out regular file vnodes open for 4147 * writing. 4148 * 4149 * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped. 4150 * 4151 * `rootrefs' specifies the base reference count for the root vnode 4152 * of this filesystem. The root vnode is considered busy if its 4153 * v_usecount exceeds this value. On a successful return, vflush(, td) 4154 * will call vrele() on the root vnode exactly rootrefs times. 4155 * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must 4156 * be zero. 4157 */ 4158 #ifdef DIAGNOSTIC 4159 static int busyprt = 0; /* print out busy vnodes */ 4160 SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "Print out busy vnodes"); 4161 #endif 4162 4163 int 4164 vflush(struct mount *mp, int rootrefs, int flags, struct thread *td) 4165 { 4166 struct vnode *vp, *mvp, *rootvp = NULL; 4167 struct vattr vattr; 4168 int busy = 0, error; 4169 4170 CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp, 4171 rootrefs, flags); 4172 if (rootrefs > 0) { 4173 KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0, 4174 ("vflush: bad args")); 4175 /* 4176 * Get the filesystem root vnode. We can vput() it 4177 * immediately, since with rootrefs > 0, it won't go away. 
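		 * (The caller already holds rootrefs references on the root
		 * vnode; those are the references vrele()d at the bottom of
		 * this function on success.)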
4178 */ 4179 if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) { 4180 CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d", 4181 __func__, error); 4182 return (error); 4183 } 4184 vput(rootvp); 4185 } 4186 loop: 4187 MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { 4188 vholdl(vp); 4189 error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE); 4190 if (error) { 4191 vdrop(vp); 4192 MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); 4193 goto loop; 4194 } 4195 /* 4196 * Skip over a vnodes marked VV_SYSTEM. 4197 */ 4198 if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) { 4199 VOP_UNLOCK(vp); 4200 vdrop(vp); 4201 continue; 4202 } 4203 /* 4204 * If WRITECLOSE is set, flush out unlinked but still open 4205 * files (even if open only for reading) and regular file 4206 * vnodes open for writing. 4207 */ 4208 if (flags & WRITECLOSE) { 4209 vnode_pager_clean_async(vp); 4210 do { 4211 error = VOP_FSYNC(vp, MNT_WAIT, td); 4212 } while (error == ERELOOKUP); 4213 if (error != 0) { 4214 VOP_UNLOCK(vp); 4215 vdrop(vp); 4216 MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); 4217 return (error); 4218 } 4219 error = VOP_GETATTR(vp, &vattr, td->td_ucred); 4220 VI_LOCK(vp); 4221 4222 if ((vp->v_type == VNON || 4223 (error == 0 && vattr.va_nlink > 0)) && 4224 (vp->v_writecount <= 0 || vp->v_type != VREG)) { 4225 VOP_UNLOCK(vp); 4226 vdropl(vp); 4227 continue; 4228 } 4229 } else 4230 VI_LOCK(vp); 4231 /* 4232 * With v_usecount == 0, all we need to do is clear out the 4233 * vnode data structures and we are done. 4234 * 4235 * If FORCECLOSE is set, forcibly close the vnode. 4236 */ 4237 if (vp->v_usecount == 0 || (flags & FORCECLOSE)) { 4238 vgonel(vp); 4239 } else { 4240 busy++; 4241 #ifdef DIAGNOSTIC 4242 if (busyprt) 4243 vn_printf(vp, "vflush: busy vnode "); 4244 #endif 4245 } 4246 VOP_UNLOCK(vp); 4247 vdropl(vp); 4248 } 4249 if (rootrefs > 0 && (flags & FORCECLOSE) == 0) { 4250 /* 4251 * If just the root vnode is busy, and if its refcount 4252 * is equal to `rootrefs', then go ahead and kill it. 4253 */ 4254 VI_LOCK(rootvp); 4255 KASSERT(busy > 0, ("vflush: not busy")); 4256 VNASSERT(rootvp->v_usecount >= rootrefs, rootvp, 4257 ("vflush: usecount %d < rootrefs %d", 4258 rootvp->v_usecount, rootrefs)); 4259 if (busy == 1 && rootvp->v_usecount == rootrefs) { 4260 VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK); 4261 vgone(rootvp); 4262 VOP_UNLOCK(rootvp); 4263 busy = 0; 4264 } else 4265 VI_UNLOCK(rootvp); 4266 } 4267 if (busy) { 4268 CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__, 4269 busy); 4270 return (EBUSY); 4271 } 4272 for (; rootrefs > 0; rootrefs--) 4273 vrele(rootvp); 4274 return (0); 4275 } 4276 4277 /* 4278 * Recycle an unused vnode. 4279 */ 4280 int 4281 vrecycle(struct vnode *vp) 4282 { 4283 int recycled; 4284 4285 VI_LOCK(vp); 4286 recycled = vrecyclel(vp); 4287 VI_UNLOCK(vp); 4288 return (recycled); 4289 } 4290 4291 /* 4292 * vrecycle, with the vp interlock held. 4293 */ 4294 int 4295 vrecyclel(struct vnode *vp) 4296 { 4297 int recycled; 4298 4299 ASSERT_VOP_ELOCKED(vp, __func__); 4300 ASSERT_VI_LOCKED(vp, __func__); 4301 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 4302 recycled = 0; 4303 if (vp->v_usecount == 0) { 4304 recycled = 1; 4305 vgonel(vp); 4306 } 4307 return (recycled); 4308 } 4309 4310 /* 4311 * Eliminate all activity associated with a vnode 4312 * in preparation for reuse. 4313 */ 4314 void 4315 vgone(struct vnode *vp) 4316 { 4317 VI_LOCK(vp); 4318 vgonel(vp); 4319 VI_UNLOCK(vp); 4320 } 4321 4322 /* 4323 * Notify upper mounts about reclaimed or unlinked vnode. 
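 *
 * For example (illustrative), a stacking filesystem such as nullfs that has
 * registered itself on the lower mount's mnt_notify list will have its
 * VFS_RECLAIM_LOWERVP() or VFS_UNLINK_LOWERVP() method invoked with the
 * lower vnode passed here.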
4324 */ 4325 void 4326 vfs_notify_upper(struct vnode *vp, enum vfs_notify_upper_type event) 4327 { 4328 struct mount *mp; 4329 struct mount_upper_node *ump; 4330 4331 mp = atomic_load_ptr(&vp->v_mount); 4332 if (mp == NULL) 4333 return; 4334 if (TAILQ_EMPTY(&mp->mnt_notify)) 4335 return; 4336 4337 MNT_ILOCK(mp); 4338 mp->mnt_upper_pending++; 4339 KASSERT(mp->mnt_upper_pending > 0, 4340 ("%s: mnt_upper_pending %d", __func__, mp->mnt_upper_pending)); 4341 TAILQ_FOREACH(ump, &mp->mnt_notify, mnt_upper_link) { 4342 MNT_IUNLOCK(mp); 4343 switch (event) { 4344 case VFS_NOTIFY_UPPER_RECLAIM: 4345 VFS_RECLAIM_LOWERVP(ump->mp, vp); 4346 break; 4347 case VFS_NOTIFY_UPPER_UNLINK: 4348 VFS_UNLINK_LOWERVP(ump->mp, vp); 4349 break; 4350 } 4351 MNT_ILOCK(mp); 4352 } 4353 mp->mnt_upper_pending--; 4354 if ((mp->mnt_kern_flag & MNTK_UPPER_WAITER) != 0 && 4355 mp->mnt_upper_pending == 0) { 4356 mp->mnt_kern_flag &= ~MNTK_UPPER_WAITER; 4357 wakeup(&mp->mnt_uppers); 4358 } 4359 MNT_IUNLOCK(mp); 4360 } 4361 4362 /* 4363 * vgone, with the vp interlock held. 4364 */ 4365 static void 4366 vgonel(struct vnode *vp) 4367 { 4368 struct thread *td; 4369 struct mount *mp; 4370 vm_object_t object; 4371 bool active, doinginact, oweinact; 4372 4373 ASSERT_VOP_ELOCKED(vp, "vgonel"); 4374 ASSERT_VI_LOCKED(vp, "vgonel"); 4375 VNASSERT(vp->v_holdcnt, vp, 4376 ("vgonel: vp %p has no reference.", vp)); 4377 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 4378 td = curthread; 4379 4380 /* 4381 * Don't vgonel if we're already doomed. 4382 */ 4383 if (VN_IS_DOOMED(vp)) { 4384 VNPASS(vn_get_state(vp) == VSTATE_DESTROYING || \ 4385 vn_get_state(vp) == VSTATE_DEAD, vp); 4386 return; 4387 } 4388 /* 4389 * Paired with freevnode. 4390 */ 4391 vn_seqc_write_begin_locked(vp); 4392 vunlazy_gone(vp); 4393 vn_irflag_set_locked(vp, VIRF_DOOMED); 4394 vn_set_state(vp, VSTATE_DESTROYING); 4395 4396 /* 4397 * Check to see if the vnode is in use. If so, we have to 4398 * call VOP_CLOSE() and VOP_INACTIVE(). 4399 * 4400 * It could be that VOP_INACTIVE() requested reclamation, in 4401 * which case we should avoid recursion, so check 4402 * VI_DOINGINACT. This is not precise but good enough. 4403 */ 4404 active = vp->v_usecount > 0; 4405 oweinact = (vp->v_iflag & VI_OWEINACT) != 0; 4406 doinginact = (vp->v_iflag & VI_DOINGINACT) != 0; 4407 4408 /* 4409 * If we need to do inactive VI_OWEINACT will be set. 4410 */ 4411 if (vp->v_iflag & VI_DEFINACT) { 4412 VNASSERT(vp->v_holdcnt > 1, vp, ("lost hold count")); 4413 vp->v_iflag &= ~VI_DEFINACT; 4414 vdropl(vp); 4415 } else { 4416 VNASSERT(vp->v_holdcnt > 0, vp, ("vnode without hold count")); 4417 VI_UNLOCK(vp); 4418 } 4419 cache_purge_vgone(vp); 4420 vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM); 4421 4422 /* 4423 * If purging an active vnode, it must be closed and 4424 * deactivated before being reclaimed. 4425 */ 4426 if (active) 4427 VOP_CLOSE(vp, FNONBLOCK, NOCRED, td); 4428 if (!doinginact) { 4429 do { 4430 if (oweinact || active) { 4431 VI_LOCK(vp); 4432 vinactivef(vp); 4433 oweinact = (vp->v_iflag & VI_OWEINACT) != 0; 4434 VI_UNLOCK(vp); 4435 } 4436 } while (oweinact); 4437 } 4438 if (vp->v_type == VSOCK) 4439 vfs_unp_reclaim(vp); 4440 4441 /* 4442 * Clean out any buffers associated with the vnode. 4443 * If the flush fails, just toss the buffers. 
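 * (That is, if the V_SAVE pass cannot flush everything, vinvalbuf()
 * is retried without V_SAVE, discarding the dirty data.)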
4444 */ 4445 mp = NULL; 4446 if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd)) 4447 (void) vn_start_secondary_write(vp, &mp, V_WAIT); 4448 if (vinvalbuf(vp, V_SAVE, 0, 0) != 0) { 4449 while (vinvalbuf(vp, 0, 0, 0) != 0) 4450 ; 4451 } 4452 4453 BO_LOCK(&vp->v_bufobj); 4454 KASSERT(TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd) && 4455 vp->v_bufobj.bo_dirty.bv_cnt == 0 && 4456 TAILQ_EMPTY(&vp->v_bufobj.bo_clean.bv_hd) && 4457 vp->v_bufobj.bo_clean.bv_cnt == 0, 4458 ("vp %p bufobj not invalidated", vp)); 4459 4460 /* 4461 * For VMIO bufobj, BO_DEAD is set later, or in 4462 * vm_object_terminate() after the object's page queue is 4463 * flushed. 4464 */ 4465 object = vp->v_bufobj.bo_object; 4466 if (object == NULL) 4467 vp->v_bufobj.bo_flag |= BO_DEAD; 4468 BO_UNLOCK(&vp->v_bufobj); 4469 4470 /* 4471 * Handle the VM part. Tmpfs handles v_object on its own (the 4472 * OBJT_VNODE check). Nullfs or other bypassing filesystems 4473 * should not touch the object borrowed from the lower vnode 4474 * (the handle check). 4475 */ 4476 if (object != NULL && object->type == OBJT_VNODE && 4477 object->handle == vp) 4478 vnode_destroy_vobject(vp); 4479 4480 /* 4481 * Reclaim the vnode. 4482 */ 4483 if (VOP_RECLAIM(vp)) 4484 panic("vgone: cannot reclaim"); 4485 if (mp != NULL) 4486 vn_finished_secondary_write(mp); 4487 VNASSERT(vp->v_object == NULL, vp, 4488 ("vop_reclaim left v_object vp=%p", vp)); 4489 /* 4490 * Clear the advisory locks and wake up waiting threads. 4491 */ 4492 if (vp->v_lockf != NULL) { 4493 (void)VOP_ADVLOCKPURGE(vp); 4494 vp->v_lockf = NULL; 4495 } 4496 /* 4497 * Delete from old mount point vnode list. 4498 */ 4499 if (vp->v_mount == NULL) { 4500 VI_LOCK(vp); 4501 } else { 4502 delmntque(vp); 4503 ASSERT_VI_LOCKED(vp, "vgonel 2"); 4504 } 4505 /* 4506 * Done with purge, reset to the standard lock and invalidate 4507 * the vnode. 4508 */ 4509 vp->v_vnlock = &vp->v_lock; 4510 vp->v_op = &dead_vnodeops; 4511 vp->v_type = VBAD; 4512 vn_set_state(vp, VSTATE_DEAD); 4513 } 4514 4515 /* 4516 * Print out a description of a vnode. 4517 */ 4518 static const char *const vtypename[] = { 4519 [VNON] = "VNON", 4520 [VREG] = "VREG", 4521 [VDIR] = "VDIR", 4522 [VBLK] = "VBLK", 4523 [VCHR] = "VCHR", 4524 [VLNK] = "VLNK", 4525 [VSOCK] = "VSOCK", 4526 [VFIFO] = "VFIFO", 4527 [VBAD] = "VBAD", 4528 [VMARKER] = "VMARKER", 4529 }; 4530 _Static_assert(nitems(vtypename) == VLASTTYPE + 1, 4531 "vnode type name not added to vtypename"); 4532 4533 static const char *const vstatename[] = { 4534 [VSTATE_UNINITIALIZED] = "VSTATE_UNINITIALIZED", 4535 [VSTATE_CONSTRUCTED] = "VSTATE_CONSTRUCTED", 4536 [VSTATE_DESTROYING] = "VSTATE_DESTROYING", 4537 [VSTATE_DEAD] = "VSTATE_DEAD", 4538 }; 4539 _Static_assert(nitems(vstatename) == VLASTSTATE + 1, 4540 "vnode state name not added to vstatename"); 4541 4542 _Static_assert((VHOLD_ALL_FLAGS & ~VHOLD_NO_SMR) == 0, 4543 "new hold count flag not added to vn_printf"); 4544 4545 void 4546 vn_printf(struct vnode *vp, const char *fmt, ...) 
4547 { 4548 va_list ap; 4549 char buf[256], buf2[16]; 4550 u_long flags; 4551 u_int holdcnt; 4552 short irflag; 4553 4554 va_start(ap, fmt); 4555 vprintf(fmt, ap); 4556 va_end(ap); 4557 printf("%p: ", (void *)vp); 4558 printf("type %s state %s op %p\n", vtypename[vp->v_type], 4559 vstatename[vp->v_state], vp->v_op); 4560 holdcnt = atomic_load_int(&vp->v_holdcnt); 4561 printf(" usecount %d, writecount %d, refcount %d seqc users %d", 4562 vp->v_usecount, vp->v_writecount, holdcnt & ~VHOLD_ALL_FLAGS, 4563 vp->v_seqc_users); 4564 switch (vp->v_type) { 4565 case VDIR: 4566 printf(" mountedhere %p\n", vp->v_mountedhere); 4567 break; 4568 case VCHR: 4569 printf(" rdev %p\n", vp->v_rdev); 4570 break; 4571 case VSOCK: 4572 printf(" socket %p\n", vp->v_unpcb); 4573 break; 4574 case VFIFO: 4575 printf(" fifoinfo %p\n", vp->v_fifoinfo); 4576 break; 4577 default: 4578 printf("\n"); 4579 break; 4580 } 4581 buf[0] = '\0'; 4582 buf[1] = '\0'; 4583 if (holdcnt & VHOLD_NO_SMR) 4584 strlcat(buf, "|VHOLD_NO_SMR", sizeof(buf)); 4585 printf(" hold count flags (%s)\n", buf + 1); 4586 4587 buf[0] = '\0'; 4588 buf[1] = '\0'; 4589 irflag = vn_irflag_read(vp); 4590 if (irflag & VIRF_DOOMED) 4591 strlcat(buf, "|VIRF_DOOMED", sizeof(buf)); 4592 if (irflag & VIRF_PGREAD) 4593 strlcat(buf, "|VIRF_PGREAD", sizeof(buf)); 4594 if (irflag & VIRF_MOUNTPOINT) 4595 strlcat(buf, "|VIRF_MOUNTPOINT", sizeof(buf)); 4596 if (irflag & VIRF_TEXT_REF) 4597 strlcat(buf, "|VIRF_TEXT_REF", sizeof(buf)); 4598 flags = irflag & ~(VIRF_DOOMED | VIRF_PGREAD | VIRF_MOUNTPOINT | VIRF_TEXT_REF); 4599 if (flags != 0) { 4600 snprintf(buf2, sizeof(buf2), "|VIRF(0x%lx)", flags); 4601 strlcat(buf, buf2, sizeof(buf)); 4602 } 4603 if (vp->v_vflag & VV_ROOT) 4604 strlcat(buf, "|VV_ROOT", sizeof(buf)); 4605 if (vp->v_vflag & VV_ISTTY) 4606 strlcat(buf, "|VV_ISTTY", sizeof(buf)); 4607 if (vp->v_vflag & VV_NOSYNC) 4608 strlcat(buf, "|VV_NOSYNC", sizeof(buf)); 4609 if (vp->v_vflag & VV_ETERNALDEV) 4610 strlcat(buf, "|VV_ETERNALDEV", sizeof(buf)); 4611 if (vp->v_vflag & VV_CACHEDLABEL) 4612 strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf)); 4613 if (vp->v_vflag & VV_VMSIZEVNLOCK) 4614 strlcat(buf, "|VV_VMSIZEVNLOCK", sizeof(buf)); 4615 if (vp->v_vflag & VV_COPYONWRITE) 4616 strlcat(buf, "|VV_COPYONWRITE", sizeof(buf)); 4617 if (vp->v_vflag & VV_SYSTEM) 4618 strlcat(buf, "|VV_SYSTEM", sizeof(buf)); 4619 if (vp->v_vflag & VV_PROCDEP) 4620 strlcat(buf, "|VV_PROCDEP", sizeof(buf)); 4621 if (vp->v_vflag & VV_DELETED) 4622 strlcat(buf, "|VV_DELETED", sizeof(buf)); 4623 if (vp->v_vflag & VV_MD) 4624 strlcat(buf, "|VV_MD", sizeof(buf)); 4625 if (vp->v_vflag & VV_FORCEINSMQ) 4626 strlcat(buf, "|VV_FORCEINSMQ", sizeof(buf)); 4627 if (vp->v_vflag & VV_READLINK) 4628 strlcat(buf, "|VV_READLINK", sizeof(buf)); 4629 flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_ETERNALDEV | 4630 VV_CACHEDLABEL | VV_VMSIZEVNLOCK | VV_COPYONWRITE | VV_SYSTEM | 4631 VV_PROCDEP | VV_DELETED | VV_MD | VV_FORCEINSMQ | VV_READLINK); 4632 if (flags != 0) { 4633 snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags); 4634 strlcat(buf, buf2, sizeof(buf)); 4635 } 4636 if (vp->v_iflag & VI_MOUNT) 4637 strlcat(buf, "|VI_MOUNT", sizeof(buf)); 4638 if (vp->v_iflag & VI_DOINGINACT) 4639 strlcat(buf, "|VI_DOINGINACT", sizeof(buf)); 4640 if (vp->v_iflag & VI_OWEINACT) 4641 strlcat(buf, "|VI_OWEINACT", sizeof(buf)); 4642 if (vp->v_iflag & VI_DEFINACT) 4643 strlcat(buf, "|VI_DEFINACT", sizeof(buf)); 4644 if (vp->v_iflag & VI_FOPENING) 4645 strlcat(buf, "|VI_FOPENING", sizeof(buf)); 4646 flags = 
vp->v_iflag & ~(VI_MOUNT | VI_DOINGINACT | 4647 VI_OWEINACT | VI_DEFINACT | VI_FOPENING); 4648 if (flags != 0) { 4649 snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags); 4650 strlcat(buf, buf2, sizeof(buf)); 4651 } 4652 if (vp->v_mflag & VMP_LAZYLIST) 4653 strlcat(buf, "|VMP_LAZYLIST", sizeof(buf)); 4654 flags = vp->v_mflag & ~(VMP_LAZYLIST); 4655 if (flags != 0) { 4656 snprintf(buf2, sizeof(buf2), "|VMP(0x%lx)", flags); 4657 strlcat(buf, buf2, sizeof(buf)); 4658 } 4659 printf(" flags (%s)", buf + 1); 4660 if (mtx_owned(VI_MTX(vp))) 4661 printf(" VI_LOCKed"); 4662 printf("\n"); 4663 if (vp->v_object != NULL) 4664 printf(" v_object %p ref %d pages %d " 4665 "cleanbuf %d dirtybuf %d\n", 4666 vp->v_object, vp->v_object->ref_count, 4667 vp->v_object->resident_page_count, 4668 vp->v_bufobj.bo_clean.bv_cnt, 4669 vp->v_bufobj.bo_dirty.bv_cnt); 4670 printf(" "); 4671 lockmgr_printinfo(vp->v_vnlock); 4672 if (vp->v_data != NULL) 4673 VOP_PRINT(vp); 4674 } 4675 4676 #ifdef DDB 4677 /* 4678 * List all of the locked vnodes in the system. 4679 * Called when debugging the kernel. 4680 */ 4681 DB_SHOW_COMMAND_FLAGS(lockedvnods, lockedvnodes, DB_CMD_MEMSAFE) 4682 { 4683 struct mount *mp; 4684 struct vnode *vp; 4685 4686 /* 4687 * Note: because this is DDB, we can't obey the locking semantics 4688 * for these structures, which means we could catch an inconsistent 4689 * state and dereference a nasty pointer. Not much to be done 4690 * about that. 4691 */ 4692 db_printf("Locked vnodes\n"); 4693 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 4694 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 4695 if (vp->v_type != VMARKER && VOP_ISLOCKED(vp)) 4696 vn_printf(vp, "vnode "); 4697 } 4698 } 4699 } 4700 4701 /* 4702 * Show details about the given vnode. 4703 */ 4704 DB_SHOW_COMMAND(vnode, db_show_vnode) 4705 { 4706 struct vnode *vp; 4707 4708 if (!have_addr) 4709 return; 4710 vp = (struct vnode *)addr; 4711 vn_printf(vp, "vnode "); 4712 } 4713 4714 /* 4715 * Show details about the given mount point. 4716 */ 4717 DB_SHOW_COMMAND(mount, db_show_mount) 4718 { 4719 struct mount *mp; 4720 struct vfsopt *opt; 4721 struct statfs *sp; 4722 struct vnode *vp; 4723 char buf[512]; 4724 uint64_t mflags; 4725 u_int flags; 4726 4727 if (!have_addr) { 4728 /* No address given, print short info about all mount points. 
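		 * Typical invocations at the ddb prompt (illustrative):
		 *	show mount
		 *	show mount <addr>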
*/ 4729 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 4730 db_printf("%p %s on %s (%s)\n", mp, 4731 mp->mnt_stat.f_mntfromname, 4732 mp->mnt_stat.f_mntonname, 4733 mp->mnt_stat.f_fstypename); 4734 if (db_pager_quit) 4735 break; 4736 } 4737 db_printf("\nMore info: show mount <addr>\n"); 4738 return; 4739 } 4740 4741 mp = (struct mount *)addr; 4742 db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname, 4743 mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename); 4744 4745 buf[0] = '\0'; 4746 mflags = mp->mnt_flag; 4747 #define MNT_FLAG(flag) do { \ 4748 if (mflags & (flag)) { \ 4749 if (buf[0] != '\0') \ 4750 strlcat(buf, ", ", sizeof(buf)); \ 4751 strlcat(buf, (#flag) + 4, sizeof(buf)); \ 4752 mflags &= ~(flag); \ 4753 } \ 4754 } while (0) 4755 MNT_FLAG(MNT_RDONLY); 4756 MNT_FLAG(MNT_SYNCHRONOUS); 4757 MNT_FLAG(MNT_NOEXEC); 4758 MNT_FLAG(MNT_NOSUID); 4759 MNT_FLAG(MNT_NFS4ACLS); 4760 MNT_FLAG(MNT_UNION); 4761 MNT_FLAG(MNT_ASYNC); 4762 MNT_FLAG(MNT_SUIDDIR); 4763 MNT_FLAG(MNT_SOFTDEP); 4764 MNT_FLAG(MNT_NOSYMFOLLOW); 4765 MNT_FLAG(MNT_GJOURNAL); 4766 MNT_FLAG(MNT_MULTILABEL); 4767 MNT_FLAG(MNT_ACLS); 4768 MNT_FLAG(MNT_NOATIME); 4769 MNT_FLAG(MNT_NOCLUSTERR); 4770 MNT_FLAG(MNT_NOCLUSTERW); 4771 MNT_FLAG(MNT_SUJ); 4772 MNT_FLAG(MNT_EXRDONLY); 4773 MNT_FLAG(MNT_EXPORTED); 4774 MNT_FLAG(MNT_DEFEXPORTED); 4775 MNT_FLAG(MNT_EXPORTANON); 4776 MNT_FLAG(MNT_EXKERB); 4777 MNT_FLAG(MNT_EXPUBLIC); 4778 MNT_FLAG(MNT_LOCAL); 4779 MNT_FLAG(MNT_QUOTA); 4780 MNT_FLAG(MNT_ROOTFS); 4781 MNT_FLAG(MNT_USER); 4782 MNT_FLAG(MNT_IGNORE); 4783 MNT_FLAG(MNT_UPDATE); 4784 MNT_FLAG(MNT_DELEXPORT); 4785 MNT_FLAG(MNT_RELOAD); 4786 MNT_FLAG(MNT_FORCE); 4787 MNT_FLAG(MNT_SNAPSHOT); 4788 MNT_FLAG(MNT_BYFSID); 4789 MNT_FLAG(MNT_NAMEDATTR); 4790 #undef MNT_FLAG 4791 if (mflags != 0) { 4792 if (buf[0] != '\0') 4793 strlcat(buf, ", ", sizeof(buf)); 4794 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), 4795 "0x%016jx", mflags); 4796 } 4797 db_printf(" mnt_flag = %s\n", buf); 4798 4799 buf[0] = '\0'; 4800 flags = mp->mnt_kern_flag; 4801 #define MNT_KERN_FLAG(flag) do { \ 4802 if (flags & (flag)) { \ 4803 if (buf[0] != '\0') \ 4804 strlcat(buf, ", ", sizeof(buf)); \ 4805 strlcat(buf, (#flag) + 5, sizeof(buf)); \ 4806 flags &= ~(flag); \ 4807 } \ 4808 } while (0) 4809 MNT_KERN_FLAG(MNTK_UNMOUNTF); 4810 MNT_KERN_FLAG(MNTK_ASYNC); 4811 MNT_KERN_FLAG(MNTK_SOFTDEP); 4812 MNT_KERN_FLAG(MNTK_NOMSYNC); 4813 MNT_KERN_FLAG(MNTK_DRAINING); 4814 MNT_KERN_FLAG(MNTK_REFEXPIRE); 4815 MNT_KERN_FLAG(MNTK_EXTENDED_SHARED); 4816 MNT_KERN_FLAG(MNTK_SHARED_WRITES); 4817 MNT_KERN_FLAG(MNTK_NO_IOPF); 4818 MNT_KERN_FLAG(MNTK_RECURSE); 4819 MNT_KERN_FLAG(MNTK_UPPER_WAITER); 4820 MNT_KERN_FLAG(MNTK_UNLOCKED_INSMNTQUE); 4821 MNT_KERN_FLAG(MNTK_USES_BCACHE); 4822 MNT_KERN_FLAG(MNTK_VMSETSIZE_BUG); 4823 MNT_KERN_FLAG(MNTK_FPLOOKUP); 4824 MNT_KERN_FLAG(MNTK_TASKQUEUE_WAITER); 4825 MNT_KERN_FLAG(MNTK_NOASYNC); 4826 MNT_KERN_FLAG(MNTK_UNMOUNT); 4827 MNT_KERN_FLAG(MNTK_MWAIT); 4828 MNT_KERN_FLAG(MNTK_SUSPEND); 4829 MNT_KERN_FLAG(MNTK_SUSPEND2); 4830 MNT_KERN_FLAG(MNTK_SUSPENDED); 4831 MNT_KERN_FLAG(MNTK_NULL_NOCACHE); 4832 MNT_KERN_FLAG(MNTK_LOOKUP_SHARED); 4833 #undef MNT_KERN_FLAG 4834 if (flags != 0) { 4835 if (buf[0] != '\0') 4836 strlcat(buf, ", ", sizeof(buf)); 4837 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), 4838 "0x%08x", flags); 4839 } 4840 db_printf(" mnt_kern_flag = %s\n", buf); 4841 4842 db_printf(" mnt_opt = "); 4843 opt = TAILQ_FIRST(mp->mnt_opt); 4844 if (opt != NULL) { 4845 db_printf("%s", opt->name); 4846 opt = TAILQ_NEXT(opt, 
link); 4847 while (opt != NULL) { 4848 db_printf(", %s", opt->name); 4849 opt = TAILQ_NEXT(opt, link); 4850 } 4851 } 4852 db_printf("\n"); 4853 4854 sp = &mp->mnt_stat; 4855 db_printf(" mnt_stat = { version=%u type=%u flags=0x%016jx " 4856 "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju " 4857 "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju " 4858 "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n", 4859 (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags, 4860 (uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize, 4861 (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree, 4862 (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files, 4863 (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites, 4864 (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads, 4865 (uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax, 4866 (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]); 4867 4868 db_printf(" mnt_cred = { uid=%u ruid=%u", 4869 (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid); 4870 if (jailed(mp->mnt_cred)) 4871 db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id); 4872 db_printf(" }\n"); 4873 db_printf(" mnt_ref = %d (with %d in the struct)\n", 4874 vfs_mount_fetch_counter(mp, MNT_COUNT_REF), mp->mnt_ref); 4875 db_printf(" mnt_gen = %d\n", mp->mnt_gen); 4876 db_printf(" mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize); 4877 db_printf(" mnt_lazyvnodelistsize = %d\n", 4878 mp->mnt_lazyvnodelistsize); 4879 db_printf(" mnt_writeopcount = %d (with %d in the struct)\n", 4880 vfs_mount_fetch_counter(mp, MNT_COUNT_WRITEOPCOUNT), mp->mnt_writeopcount); 4881 db_printf(" mnt_iosize_max = %d\n", mp->mnt_iosize_max); 4882 db_printf(" mnt_hashseed = %u\n", mp->mnt_hashseed); 4883 db_printf(" mnt_lockref = %d (with %d in the struct)\n", 4884 vfs_mount_fetch_counter(mp, MNT_COUNT_LOCKREF), mp->mnt_lockref); 4885 db_printf(" mnt_secondary_writes = %d\n", mp->mnt_secondary_writes); 4886 db_printf(" mnt_secondary_accwrites = %d\n", 4887 mp->mnt_secondary_accwrites); 4888 db_printf(" mnt_gjprovider = %s\n", 4889 mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL"); 4890 db_printf(" mnt_vfs_ops = %d\n", mp->mnt_vfs_ops); 4891 4892 db_printf("\n\nList of active vnodes\n"); 4893 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 4894 if (vp->v_type != VMARKER && vp->v_holdcnt > 0) { 4895 vn_printf(vp, "vnode "); 4896 if (db_pager_quit) 4897 break; 4898 } 4899 } 4900 db_printf("\n\nList of inactive vnodes\n"); 4901 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 4902 if (vp->v_type != VMARKER && vp->v_holdcnt == 0) { 4903 vn_printf(vp, "vnode "); 4904 if (db_pager_quit) 4905 break; 4906 } 4907 } 4908 } 4909 #endif /* DDB */ 4910 4911 /* 4912 * Fill in a struct xvfsconf based on a struct vfsconf. 4913 */ 4914 static int 4915 vfsconf2x(struct sysctl_req *req, struct vfsconf *vfsp) 4916 { 4917 struct xvfsconf xvfsp; 4918 4919 bzero(&xvfsp, sizeof(xvfsp)); 4920 strcpy(xvfsp.vfc_name, vfsp->vfc_name); 4921 xvfsp.vfc_typenum = vfsp->vfc_typenum; 4922 xvfsp.vfc_refcount = vfsp->vfc_refcount; 4923 xvfsp.vfc_flags = vfsp->vfc_flags; 4924 /* 4925 * These are unused in userland, we keep them 4926 * to not break binary compatibility. 
4927 */ 4928 xvfsp.vfc_vfsops = NULL; 4929 xvfsp.vfc_next = NULL; 4930 return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); 4931 } 4932 4933 #ifdef COMPAT_FREEBSD32 4934 struct xvfsconf32 { 4935 uint32_t vfc_vfsops; 4936 char vfc_name[MFSNAMELEN]; 4937 int32_t vfc_typenum; 4938 int32_t vfc_refcount; 4939 int32_t vfc_flags; 4940 uint32_t vfc_next; 4941 }; 4942 4943 static int 4944 vfsconf2x32(struct sysctl_req *req, struct vfsconf *vfsp) 4945 { 4946 struct xvfsconf32 xvfsp; 4947 4948 bzero(&xvfsp, sizeof(xvfsp)); 4949 strcpy(xvfsp.vfc_name, vfsp->vfc_name); 4950 xvfsp.vfc_typenum = vfsp->vfc_typenum; 4951 xvfsp.vfc_refcount = vfsp->vfc_refcount; 4952 xvfsp.vfc_flags = vfsp->vfc_flags; 4953 return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); 4954 } 4955 #endif 4956 4957 /* 4958 * Top level filesystem related information gathering. 4959 */ 4960 static int 4961 sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS) 4962 { 4963 struct vfsconf *vfsp; 4964 int error; 4965 4966 error = 0; 4967 vfsconf_slock(); 4968 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { 4969 #ifdef COMPAT_FREEBSD32 4970 if (req->flags & SCTL_MASK32) 4971 error = vfsconf2x32(req, vfsp); 4972 else 4973 #endif 4974 error = vfsconf2x(req, vfsp); 4975 if (error) 4976 break; 4977 } 4978 vfsconf_sunlock(); 4979 return (error); 4980 } 4981 4982 SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD | 4983 CTLFLAG_MPSAFE, NULL, 0, sysctl_vfs_conflist, 4984 "S,xvfsconf", "List of all configured filesystems"); 4985 4986 #ifndef BURN_BRIDGES 4987 static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS); 4988 4989 static int 4990 vfs_sysctl(SYSCTL_HANDLER_ARGS) 4991 { 4992 int *name = (int *)arg1 - 1; /* XXX */ 4993 u_int namelen = arg2 + 1; /* XXX */ 4994 struct vfsconf *vfsp; 4995 4996 log(LOG_WARNING, "userland calling deprecated sysctl, " 4997 "please rebuild world\n"); 4998 4999 #if 1 || defined(COMPAT_PRELITE2) 5000 /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. 
*/ 5001 if (namelen == 1) 5002 return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); 5003 #endif 5004 5005 switch (name[1]) { 5006 case VFS_MAXTYPENUM: 5007 if (namelen != 2) 5008 return (ENOTDIR); 5009 return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); 5010 case VFS_CONF: 5011 if (namelen != 3) 5012 return (ENOTDIR); /* overloaded */ 5013 vfsconf_slock(); 5014 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { 5015 if (vfsp->vfc_typenum == name[2]) 5016 break; 5017 } 5018 vfsconf_sunlock(); 5019 if (vfsp == NULL) 5020 return (EOPNOTSUPP); 5021 #ifdef COMPAT_FREEBSD32 5022 if (req->flags & SCTL_MASK32) 5023 return (vfsconf2x32(req, vfsp)); 5024 else 5025 #endif 5026 return (vfsconf2x(req, vfsp)); 5027 } 5028 return (EOPNOTSUPP); 5029 } 5030 5031 static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP | 5032 CTLFLAG_MPSAFE, vfs_sysctl, 5033 "Generic filesystem"); 5034 5035 #if 1 || defined(COMPAT_PRELITE2) 5036 5037 static int 5038 sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS) 5039 { 5040 int error; 5041 struct vfsconf *vfsp; 5042 struct ovfsconf ovfs; 5043 5044 vfsconf_slock(); 5045 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { 5046 bzero(&ovfs, sizeof(ovfs)); 5047 ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ 5048 strcpy(ovfs.vfc_name, vfsp->vfc_name); 5049 ovfs.vfc_index = vfsp->vfc_typenum; 5050 ovfs.vfc_refcount = vfsp->vfc_refcount; 5051 ovfs.vfc_flags = vfsp->vfc_flags; 5052 error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); 5053 if (error != 0) { 5054 vfsconf_sunlock(); 5055 return (error); 5056 } 5057 } 5058 vfsconf_sunlock(); 5059 return (0); 5060 } 5061 5062 #endif /* 1 || COMPAT_PRELITE2 */ 5063 #endif /* !BURN_BRIDGES */ 5064 5065 static void 5066 unmount_or_warn(struct mount *mp) 5067 { 5068 int error; 5069 5070 error = dounmount(mp, MNT_FORCE, curthread); 5071 if (error != 0) { 5072 printf("unmount of %s failed (", mp->mnt_stat.f_mntonname); 5073 if (error == EBUSY) 5074 printf("BUSY)\n"); 5075 else 5076 printf("%d)\n", error); 5077 } 5078 } 5079 5080 /* 5081 * Unmount all filesystems. The list is traversed in reverse order 5082 * of mounting to avoid dependencies. 5083 */ 5084 void 5085 vfs_unmountall(void) 5086 { 5087 struct mount *mp, *tmp; 5088 5089 CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__); 5090 5091 /* 5092 * Since this only runs when rebooting, it is not interlocked. 5093 */ 5094 TAILQ_FOREACH_REVERSE_SAFE(mp, &mountlist, mntlist, mnt_list, tmp) { 5095 vfs_ref(mp); 5096 5097 /* 5098 * Forcibly unmounting "/dev" before "/" would prevent clean 5099 * unmount of the latter. 
5100 */ 5101 if (mp == rootdevmp) 5102 continue; 5103 5104 unmount_or_warn(mp); 5105 } 5106 5107 if (rootdevmp != NULL) 5108 unmount_or_warn(rootdevmp); 5109 } 5110 5111 static void 5112 vfs_deferred_inactive(struct vnode *vp, int lkflags) 5113 { 5114 5115 ASSERT_VI_LOCKED(vp, __func__); 5116 VNPASS((vp->v_iflag & VI_DEFINACT) == 0, vp); 5117 if ((vp->v_iflag & VI_OWEINACT) == 0) { 5118 vdropl(vp); 5119 return; 5120 } 5121 if (vn_lock(vp, lkflags) == 0) { 5122 VI_LOCK(vp); 5123 vinactive(vp); 5124 VOP_UNLOCK(vp); 5125 vdropl(vp); 5126 return; 5127 } 5128 vdefer_inactive_unlocked(vp); 5129 } 5130 5131 static int 5132 vfs_periodic_inactive_filter(struct vnode *vp, void *arg) 5133 { 5134 5135 return (vp->v_iflag & VI_DEFINACT); 5136 } 5137 5138 static void __noinline 5139 vfs_periodic_inactive(struct mount *mp, int flags) 5140 { 5141 struct vnode *vp, *mvp; 5142 int lkflags; 5143 5144 lkflags = LK_EXCLUSIVE | LK_INTERLOCK; 5145 if (flags != MNT_WAIT) 5146 lkflags |= LK_NOWAIT; 5147 5148 MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, vfs_periodic_inactive_filter, NULL) { 5149 if ((vp->v_iflag & VI_DEFINACT) == 0) { 5150 VI_UNLOCK(vp); 5151 continue; 5152 } 5153 vp->v_iflag &= ~VI_DEFINACT; 5154 vfs_deferred_inactive(vp, lkflags); 5155 } 5156 } 5157 5158 static inline bool 5159 vfs_want_msync(struct vnode *vp) 5160 { 5161 struct vm_object *obj; 5162 5163 /* 5164 * This test may be performed without any locks held. 5165 * We rely on vm_object's type stability. 5166 */ 5167 if (vp->v_vflag & VV_NOSYNC) 5168 return (false); 5169 obj = vp->v_object; 5170 return (obj != NULL && vm_object_mightbedirty(obj)); 5171 } 5172 5173 static int 5174 vfs_periodic_msync_inactive_filter(struct vnode *vp, void *arg __unused) 5175 { 5176 5177 if (vp->v_vflag & VV_NOSYNC) 5178 return (false); 5179 if (vp->v_iflag & VI_DEFINACT) 5180 return (true); 5181 return (vfs_want_msync(vp)); 5182 } 5183 5184 static void __noinline 5185 vfs_periodic_msync_inactive(struct mount *mp, int flags) 5186 { 5187 struct vnode *vp, *mvp; 5188 int lkflags; 5189 bool seen_defer; 5190 5191 lkflags = LK_EXCLUSIVE | LK_INTERLOCK; 5192 if (flags != MNT_WAIT) 5193 lkflags |= LK_NOWAIT; 5194 5195 MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, vfs_periodic_msync_inactive_filter, NULL) { 5196 seen_defer = false; 5197 if (vp->v_iflag & VI_DEFINACT) { 5198 vp->v_iflag &= ~VI_DEFINACT; 5199 seen_defer = true; 5200 } 5201 if (!vfs_want_msync(vp)) { 5202 if (seen_defer) 5203 vfs_deferred_inactive(vp, lkflags); 5204 else 5205 VI_UNLOCK(vp); 5206 continue; 5207 } 5208 if (vget(vp, lkflags) == 0) { 5209 if ((vp->v_vflag & VV_NOSYNC) == 0) { 5210 if (flags == MNT_WAIT) 5211 vnode_pager_clean_sync(vp); 5212 else 5213 vnode_pager_clean_async(vp); 5214 } 5215 vput(vp); 5216 if (seen_defer) 5217 vdrop(vp); 5218 } else { 5219 if (seen_defer) 5220 vdefer_inactive_unlocked(vp); 5221 } 5222 } 5223 } 5224 5225 void 5226 vfs_periodic(struct mount *mp, int flags) 5227 { 5228 5229 CTR2(KTR_VFS, "%s: mp %p", __func__, mp); 5230 5231 if ((mp->mnt_kern_flag & MNTK_NOMSYNC) != 0) 5232 vfs_periodic_inactive(mp, flags); 5233 else 5234 vfs_periodic_msync_inactive(mp, flags); 5235 } 5236 5237 static void 5238 destroy_vpollinfo_free(struct vpollinfo *vi) 5239 { 5240 5241 knlist_destroy(&vi->vpi_selinfo.si_note); 5242 mtx_destroy(&vi->vpi_lock); 5243 free(vi, M_VNODEPOLL); 5244 } 5245 5246 static void 5247 destroy_vpollinfo(struct vpollinfo *vi) 5248 { 5249 KASSERT(TAILQ_EMPTY(&vi->vpi_inotify), 5250 ("%s: pollinfo %p has lingering watches", __func__, vi)); 5251 
	knlist_clear(&vi->vpi_selinfo.si_note, 1);
	seldrain(&vi->vpi_selinfo);
	destroy_vpollinfo_free(vi);
}

/*
 * Initialize per-vnode helper structure to hold poll-related state.
 */
void
v_addpollinfo(struct vnode *vp)
{
	struct vpollinfo *vi;

	if (atomic_load_ptr(&vp->v_pollinfo) != NULL)
		return;
	vi = malloc(sizeof(*vi), M_VNODEPOLL, M_WAITOK | M_ZERO);
	mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF);
	knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock,
	    vfs_knlunlock, vfs_knl_assert_lock);
	TAILQ_INIT(&vi->vpi_inotify);
	VI_LOCK(vp);
	if (vp->v_pollinfo != NULL) {
		VI_UNLOCK(vp);
		destroy_vpollinfo_free(vi);
		return;
	}
	vp->v_pollinfo = vi;
	VI_UNLOCK(vp);
}

/*
 * Record a process's interest in events which might happen to
 * a vnode. Because poll uses the historic select-style interface
 * internally, this routine serves as both the ``check for any
 * pending events'' and the ``record my interest in future events''
 * functions. (These are done together, while the lock is held,
 * to avoid race conditions.)
 */
int
vn_pollrecord(struct vnode *vp, struct thread *td, int events)
{

	v_addpollinfo(vp);
	mtx_lock(&vp->v_pollinfo->vpi_lock);
	if (vp->v_pollinfo->vpi_revents & events) {
		/*
		 * This leaves events we are not interested
		 * in available for the other process which
		 * presumably had requested them
		 * (otherwise they would never have been
		 * recorded).
		 */
		events &= vp->v_pollinfo->vpi_revents;
		vp->v_pollinfo->vpi_revents &= ~events;

		mtx_unlock(&vp->v_pollinfo->vpi_lock);
		return (events);
	}
	vp->v_pollinfo->vpi_events |= events;
	selrecord(td, &vp->v_pollinfo->vpi_selinfo);
	mtx_unlock(&vp->v_pollinfo->vpi_lock);
	return (0);
}

/*
 * Routine to create and manage a filesystem syncer vnode.
 */
#define sync_close ((int (*)(struct vop_close_args *))nullop)
static int	sync_fsync(struct vop_fsync_args *);
static int	sync_inactive(struct vop_inactive_args *);
static int	sync_reclaim(struct vop_reclaim_args *);

static struct vop_vector sync_vnodeops = {
	.vop_bypass =	VOP_EOPNOTSUPP,
	.vop_close =	sync_close,
	.vop_fsync =	sync_fsync,
	.vop_getwritemount = vop_stdgetwritemount,
	.vop_inactive =	sync_inactive,
	.vop_need_inactive = vop_stdneed_inactive,
	.vop_reclaim =	sync_reclaim,
	.vop_lock1 =	vop_stdlock,
	.vop_unlock =	vop_stdunlock,
	.vop_islocked =	vop_stdislocked,
	.vop_fplookup_vexec = VOP_EAGAIN,
	.vop_fplookup_symlink = VOP_EAGAIN,
};
VFS_VOP_VECTOR_REGISTER(sync_vnodeops);

/*
 * Create a new filesystem syncer vnode for the specified mount point.
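 *
 * The worklist slot is scattered by stepping `next' by `incr' and halving
 * both on wraparound, so that syncer vnodes of filesystems mounted back to
 * back land in different buckets. As an illustration, assuming the default
 * syncer_maxdelay of 32, successive calls pick slots 16, 8, 24, 4, 12, 20,
 * 28, ... (each taken modulo syncdelay).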
 */
void
vfs_allocate_syncvnode(struct mount *mp)
{
	struct vnode *vp;
	struct bufobj *bo;
	static long start, incr, next;
	int error;

	/* Allocate a new vnode */
	error = getnewvnode("syncer", mp, &sync_vnodeops, &vp);
	if (error != 0)
		panic("vfs_allocate_syncvnode: getnewvnode() failed");
	vp->v_type = VNON;
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	vp->v_vflag |= VV_FORCEINSMQ;
	error = insmntque1(vp, mp);
	if (error != 0)
		panic("vfs_allocate_syncvnode: insmntque() failed");
	vp->v_vflag &= ~VV_FORCEINSMQ;
	vn_set_state(vp, VSTATE_CONSTRUCTED);
	VOP_UNLOCK(vp);
	/*
	 * Place the vnode onto the syncer worklist. We attempt to
	 * scatter them about on the list so that they will go off
	 * at evenly distributed times even if all the filesystems
	 * are mounted at once.
	 */
	next += incr;
	if (next == 0 || next > syncer_maxdelay) {
		start /= 2;
		incr /= 2;
		if (start == 0) {
			start = syncer_maxdelay / 2;
			incr = syncer_maxdelay;
		}
		next = start;
	}
	bo = &vp->v_bufobj;
	BO_LOCK(bo);
	vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0);
	/* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */
	mtx_lock(&sync_mtx);
	sync_vnode_count++;
	if (mp->mnt_syncer == NULL) {
		mp->mnt_syncer = vp;
		vp = NULL;
	}
	mtx_unlock(&sync_mtx);
	BO_UNLOCK(bo);
	if (vp != NULL) {
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		vgone(vp);
		vput(vp);
	}
}

void
vfs_deallocate_syncvnode(struct mount *mp)
{
	struct vnode *vp;

	mtx_lock(&sync_mtx);
	vp = mp->mnt_syncer;
	if (vp != NULL)
		mp->mnt_syncer = NULL;
	mtx_unlock(&sync_mtx);
	if (vp != NULL)
		vrele(vp);
}

/*
 * Do a lazy sync of the filesystem.
 */
static int
sync_fsync(struct vop_fsync_args *ap)
{
	struct vnode *syncvp = ap->a_vp;
	struct mount *mp = syncvp->v_mount;
	int error, save;
	struct bufobj *bo;

	/*
	 * We only need to do something if this is a lazy evaluation.
	 */
	if (ap->a_waitfor != MNT_LAZY)
		return (0);

	/*
	 * Move ourselves to the back of the sync list.
	 */
	bo = &syncvp->v_bufobj;
	BO_LOCK(bo);
	vn_syncer_add_to_worklist(bo, syncdelay);
	BO_UNLOCK(bo);

	/*
	 * Walk the list of vnodes pushing all that are dirty and
	 * not already on the sync list.
	 */
	if (vfs_busy(mp, MBF_NOWAIT) != 0)
		return (0);
	VOP_UNLOCK(syncvp);
	save = curthread_pflags_set(TDP_SYNCIO);
	/*
	 * The filesystem at hand may be idle with free vnodes stored in the
	 * batch. Return them instead of letting them stay there indefinitely.
	 */
	vfs_periodic(mp, MNT_NOWAIT);
	error = VFS_SYNC(mp, MNT_LAZY);
	curthread_pflags_restore(save);
	vn_lock(syncvp, LK_EXCLUSIVE | LK_RETRY);
	vfs_unbusy(mp);
	return (error);
}

/*
 * The syncer vnode is no longer referenced.
 */
static int
sync_inactive(struct vop_inactive_args *ap)
{

	vgone(ap->a_vp);
	return (0);
}

/*
 * The syncer vnode is no longer needed and is being decommissioned.
 *
 * Modifications to the worklist must be protected by sync_mtx.
5472 */ 5473 static int 5474 sync_reclaim(struct vop_reclaim_args *ap) 5475 { 5476 struct vnode *vp = ap->a_vp; 5477 struct bufobj *bo; 5478 5479 bo = &vp->v_bufobj; 5480 BO_LOCK(bo); 5481 mtx_lock(&sync_mtx); 5482 if (vp->v_mount->mnt_syncer == vp) 5483 vp->v_mount->mnt_syncer = NULL; 5484 if (bo->bo_flag & BO_ONWORKLST) { 5485 LIST_REMOVE(bo, bo_synclist); 5486 syncer_worklist_len--; 5487 sync_vnode_count--; 5488 bo->bo_flag &= ~BO_ONWORKLST; 5489 } 5490 mtx_unlock(&sync_mtx); 5491 BO_UNLOCK(bo); 5492 5493 return (0); 5494 } 5495 5496 int 5497 vn_need_pageq_flush(struct vnode *vp) 5498 { 5499 struct vm_object *obj; 5500 5501 obj = vp->v_object; 5502 return (obj != NULL && (vp->v_vflag & VV_NOSYNC) == 0 && 5503 vm_object_mightbedirty(obj)); 5504 } 5505 5506 /* 5507 * Check if vnode represents a disk device 5508 */ 5509 bool 5510 vn_isdisk_error(struct vnode *vp, int *errp) 5511 { 5512 int error; 5513 5514 if (vp->v_type != VCHR) { 5515 error = ENOTBLK; 5516 goto out; 5517 } 5518 error = 0; 5519 dev_lock(); 5520 if (vp->v_rdev == NULL) 5521 error = ENXIO; 5522 else if (vp->v_rdev->si_devsw == NULL) 5523 error = ENXIO; 5524 else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK)) 5525 error = ENOTBLK; 5526 dev_unlock(); 5527 out: 5528 *errp = error; 5529 return (error == 0); 5530 } 5531 5532 bool 5533 vn_isdisk(struct vnode *vp) 5534 { 5535 int error; 5536 5537 return (vn_isdisk_error(vp, &error)); 5538 } 5539 5540 /* 5541 * VOP_FPLOOKUP_VEXEC routines are subject to special circumstances, see 5542 * the comment above cache_fplookup for details. 5543 */ 5544 int 5545 vaccess_vexec_smr(mode_t file_mode, uid_t file_uid, gid_t file_gid, struct ucred *cred) 5546 { 5547 int error; 5548 5549 VFS_SMR_ASSERT_ENTERED(); 5550 5551 /* Check the owner. */ 5552 if (cred->cr_uid == file_uid) { 5553 if (file_mode & S_IXUSR) 5554 return (0); 5555 goto out_error; 5556 } 5557 5558 /* Otherwise, check the groups (first match) */ 5559 if (groupmember(file_gid, cred)) { 5560 if (file_mode & S_IXGRP) 5561 return (0); 5562 goto out_error; 5563 } 5564 5565 /* Otherwise, check everyone else. */ 5566 if (file_mode & S_IXOTH) 5567 return (0); 5568 out_error: 5569 /* 5570 * Permission check failed, but it is possible denial will get overwritten 5571 * (e.g., when root is traversing through a 700 directory owned by someone 5572 * else). 5573 * 5574 * vaccess() calls priv_check_cred which in turn can descent into MAC 5575 * modules overriding this result. It's quite unclear what semantics 5576 * are allowed for them to operate, thus for safety we don't call them 5577 * from within the SMR section. This also means if any such modules 5578 * are present, we have to let the regular lookup decide. 5579 */ 5580 error = priv_check_cred_vfs_lookup_nomac(cred); 5581 switch (error) { 5582 case 0: 5583 return (0); 5584 case EAGAIN: 5585 /* 5586 * MAC modules present. 5587 */ 5588 return (EAGAIN); 5589 case EPERM: 5590 return (EACCES); 5591 default: 5592 return (error); 5593 } 5594 } 5595 5596 /* 5597 * Common filesystem object access control check routine. Accepts a 5598 * vnode's type, "mode", uid and gid, requested access mode, and credentials. 5599 * Returns 0 on success, or an errno on failure. 
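 *
 * An illustrative sketch of a filesystem's VOP_ACCESS() built on top of this
 * routine (the inode field names are hypothetical):
 *
 *	return (vaccess(vp->v_type, ip->i_mode, ip->i_uid, ip->i_gid,
 *	    ap->a_accmode, ap->a_cred));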
5600 */ 5601 int 5602 vaccess(__enum_uint8(vtype) type, mode_t file_mode, uid_t file_uid, gid_t file_gid, 5603 accmode_t accmode, struct ucred *cred) 5604 { 5605 accmode_t dac_granted; 5606 accmode_t priv_granted; 5607 5608 KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0, 5609 ("invalid bit in accmode")); 5610 KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE), 5611 ("VAPPEND without VWRITE")); 5612 5613 /* 5614 * Look for a normal, non-privileged way to access the file/directory 5615 * as requested. If it exists, go with that. 5616 */ 5617 5618 dac_granted = 0; 5619 5620 /* Check the owner. */ 5621 if (cred->cr_uid == file_uid) { 5622 dac_granted |= VADMIN; 5623 if (file_mode & S_IXUSR) 5624 dac_granted |= VEXEC; 5625 if (file_mode & S_IRUSR) 5626 dac_granted |= VREAD; 5627 if (file_mode & S_IWUSR) 5628 dac_granted |= (VWRITE | VAPPEND); 5629 5630 if ((accmode & dac_granted) == accmode) 5631 return (0); 5632 5633 goto privcheck; 5634 } 5635 5636 /* Otherwise, check the groups (first match) */ 5637 if (groupmember(file_gid, cred)) { 5638 if (file_mode & S_IXGRP) 5639 dac_granted |= VEXEC; 5640 if (file_mode & S_IRGRP) 5641 dac_granted |= VREAD; 5642 if (file_mode & S_IWGRP) 5643 dac_granted |= (VWRITE | VAPPEND); 5644 5645 if ((accmode & dac_granted) == accmode) 5646 return (0); 5647 5648 goto privcheck; 5649 } 5650 5651 /* Otherwise, check everyone else. */ 5652 if (file_mode & S_IXOTH) 5653 dac_granted |= VEXEC; 5654 if (file_mode & S_IROTH) 5655 dac_granted |= VREAD; 5656 if (file_mode & S_IWOTH) 5657 dac_granted |= (VWRITE | VAPPEND); 5658 if ((accmode & dac_granted) == accmode) 5659 return (0); 5660 5661 privcheck: 5662 /* 5663 * Build a privilege mask to determine if the set of privileges 5664 * satisfies the requirements when combined with the granted mask 5665 * from above. For each privilege, if the privilege is required, 5666 * bitwise or the request type onto the priv_granted mask. 5667 */ 5668 priv_granted = 0; 5669 5670 if (type == VDIR) { 5671 /* 5672 * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC 5673 * requests, instead of PRIV_VFS_EXEC. 5674 */ 5675 if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && 5676 !priv_check_cred(cred, PRIV_VFS_LOOKUP)) 5677 priv_granted |= VEXEC; 5678 } else { 5679 /* 5680 * Ensure that at least one execute bit is on. Otherwise, 5681 * a privileged user will always succeed, and we don't want 5682 * this to happen unless the file really is executable. 5683 */ 5684 if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && 5685 (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 && 5686 !priv_check_cred(cred, PRIV_VFS_EXEC)) 5687 priv_granted |= VEXEC; 5688 } 5689 5690 if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) && 5691 !priv_check_cred(cred, PRIV_VFS_READ)) 5692 priv_granted |= VREAD; 5693 5694 if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) && 5695 !priv_check_cred(cred, PRIV_VFS_WRITE)) 5696 priv_granted |= (VWRITE | VAPPEND); 5697 5698 if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) && 5699 !priv_check_cred(cred, PRIV_VFS_ADMIN)) 5700 priv_granted |= VADMIN; 5701 5702 if ((accmode & (priv_granted | dac_granted)) == accmode) { 5703 return (0); 5704 } 5705 5706 return ((accmode & VADMIN) ? EPERM : EACCES); 5707 } 5708 5709 /* 5710 * Credential check based on process requesting service, and per-attribute 5711 * permissions. 
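 *
 * An illustrative sketch of a VOP_GETEXTATTR() implementation using this
 * check before reading an attribute (argument names follow the usual vop
 * args convention):
 *
 *	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred,
 *	    ap->a_td, VREAD);
 *	if (error != 0)
 *		return (error);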
5712 */ 5713 int 5714 extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred, 5715 struct thread *td, accmode_t accmode) 5716 { 5717 5718 /* 5719 * Kernel-invoked always succeeds. 5720 */ 5721 if (cred == NOCRED) 5722 return (0); 5723 5724 /* 5725 * Do not allow privileged processes in jail to directly manipulate 5726 * system attributes. 5727 */ 5728 switch (attrnamespace) { 5729 case EXTATTR_NAMESPACE_SYSTEM: 5730 /* Potentially should be: return (EPERM); */ 5731 return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM)); 5732 case EXTATTR_NAMESPACE_USER: 5733 return (VOP_ACCESS(vp, accmode, cred, td)); 5734 default: 5735 return (EPERM); 5736 } 5737 } 5738 5739 #ifdef DEBUG_VFS_LOCKS 5740 int vfs_badlock_ddb = 1; /* Drop into debugger on violation. */ 5741 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0, 5742 "Drop into debugger on lock violation"); 5743 5744 int vfs_badlock_mutex = 1; /* Check for interlock across VOPs. */ 5745 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex, 5746 0, "Check for interlock across VOPs"); 5747 5748 int vfs_badlock_print = 1; /* Print lock violations. */ 5749 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print, 5750 0, "Print lock violations"); 5751 5752 int vfs_badlock_vnode = 1; /* Print vnode details on lock violations. */ 5753 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_vnode, CTLFLAG_RW, &vfs_badlock_vnode, 5754 0, "Print vnode details on lock violations"); 5755 5756 #ifdef KDB 5757 int vfs_badlock_backtrace = 1; /* Print backtrace at lock violations. */ 5758 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW, 5759 &vfs_badlock_backtrace, 0, "Print backtrace at lock violations"); 5760 #endif 5761 5762 static void 5763 vfs_badlock(const char *msg, const char *str, struct vnode *vp) 5764 { 5765 5766 #ifdef KDB 5767 if (vfs_badlock_backtrace) 5768 kdb_backtrace(); 5769 #endif 5770 if (vfs_badlock_vnode) 5771 vn_printf(vp, "vnode "); 5772 if (vfs_badlock_print) 5773 printf("%s: %p %s\n", str, (void *)vp, msg); 5774 if (vfs_badlock_ddb) 5775 kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); 5776 } 5777 5778 void 5779 assert_vi_locked(struct vnode *vp, const char *str) 5780 { 5781 5782 if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp))) 5783 vfs_badlock("interlock is not locked but should be", str, vp); 5784 } 5785 5786 void 5787 assert_vi_unlocked(struct vnode *vp, const char *str) 5788 { 5789 5790 if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp))) 5791 vfs_badlock("interlock is locked but should not be", str, vp); 5792 } 5793 5794 void 5795 assert_vop_locked(struct vnode *vp, const char *str) 5796 { 5797 if (KERNEL_PANICKED() || vp == NULL) 5798 return; 5799 5800 #ifdef WITNESS 5801 if ((vp->v_irflag & VIRF_CROSSMP) == 0 && 5802 witness_is_owned(&vp->v_vnlock->lock_object) == -1) 5803 #else 5804 int locked = VOP_ISLOCKED(vp); 5805 if (locked == 0 || locked == LK_EXCLOTHER) 5806 #endif 5807 vfs_badlock("is not locked but should be", str, vp); 5808 } 5809 5810 void 5811 assert_vop_unlocked(struct vnode *vp, const char *str) 5812 { 5813 if (KERNEL_PANICKED() || vp == NULL) 5814 return; 5815 5816 #ifdef WITNESS 5817 if ((vp->v_irflag & VIRF_CROSSMP) == 0 && 5818 witness_is_owned(&vp->v_vnlock->lock_object) == 1) 5819 #else 5820 if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE) 5821 #endif 5822 vfs_badlock("is locked but should not be", str, vp); 5823 } 5824 5825 void 5826 assert_vop_elocked(struct vnode *vp, const char *str) 5827 { 5828 if (KERNEL_PANICKED() || vp == 
NULL) 5829 return; 5830 5831 if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) 5832 vfs_badlock("is not exclusive locked but should be", str, vp); 5833 } 5834 #endif /* DEBUG_VFS_LOCKS */ 5835 5836 void 5837 vop_rename_fail(struct vop_rename_args *ap) 5838 { 5839 5840 if (ap->a_tvp != NULL) 5841 vput(ap->a_tvp); 5842 if (ap->a_tdvp == ap->a_tvp) 5843 vrele(ap->a_tdvp); 5844 else 5845 vput(ap->a_tdvp); 5846 vrele(ap->a_fdvp); 5847 vrele(ap->a_fvp); 5848 } 5849 5850 void 5851 vop_rename_pre(void *ap) 5852 { 5853 struct vop_rename_args *a = ap; 5854 5855 #ifdef DEBUG_VFS_LOCKS 5856 struct mount *tmp; 5857 5858 if (a->a_tvp) 5859 ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME"); 5860 ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME"); 5861 ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME"); 5862 ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME"); 5863 5864 /* Check the source (from). */ 5865 if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock && 5866 (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock)) 5867 ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked"); 5868 if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock) 5869 ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked"); 5870 5871 /* Check the target. */ 5872 if (a->a_tvp) 5873 ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked"); 5874 ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked"); 5875 5876 tmp = NULL; 5877 VOP_GETWRITEMOUNT(a->a_tdvp, &tmp); 5878 lockmgr_assert(&tmp->mnt_renamelock, KA_XLOCKED); 5879 vfs_rel(tmp); 5880 #endif 5881 /* 5882 * It may be tempting to add vn_seqc_write_begin/end calls here and 5883 * in vop_rename_post but that's not going to work out since some 5884 * filesystems relookup vnodes mid-rename. This is probably a bug. 5885 * 5886 * For now filesystems are expected to do the relevant calls after they 5887 * decide what vnodes to operate on. 5888 */ 5889 if (a->a_tdvp != a->a_fdvp) 5890 vhold(a->a_fdvp); 5891 if (a->a_tvp != a->a_fvp) 5892 vhold(a->a_fvp); 5893 vhold(a->a_tdvp); 5894 if (a->a_tvp) 5895 vhold(a->a_tvp); 5896 } 5897 5898 #ifdef DEBUG_VFS_LOCKS 5899 void 5900 vop_fplookup_vexec_debugpre(void *ap __unused) 5901 { 5902 5903 VFS_SMR_ASSERT_ENTERED(); 5904 } 5905 5906 void 5907 vop_fplookup_vexec_debugpost(void *ap, int rc) 5908 { 5909 struct vop_fplookup_vexec_args *a; 5910 struct vnode *vp; 5911 5912 a = ap; 5913 vp = a->a_vp; 5914 5915 VFS_SMR_ASSERT_ENTERED(); 5916 if (rc == EOPNOTSUPP) 5917 VNPASS(VN_IS_DOOMED(vp), vp); 5918 } 5919 5920 void 5921 vop_fplookup_symlink_debugpre(void *ap __unused) 5922 { 5923 5924 VFS_SMR_ASSERT_ENTERED(); 5925 } 5926 5927 void 5928 vop_fplookup_symlink_debugpost(void *ap __unused, int rc __unused) 5929 { 5930 5931 VFS_SMR_ASSERT_ENTERED(); 5932 } 5933 5934 static void 5935 vop_fsync_debugprepost(struct vnode *vp, const char *name) 5936 { 5937 if (vp->v_type == VCHR) 5938 ; 5939 /* 5940 * The shared vs. exclusive locking policy for fsync() 5941 * is actually determined by vp's write mount as indicated 5942 * by VOP_GETWRITEMOUNT(), which for stacked filesystems 5943 * may not be the same as vp->v_mount. However, if the 5944 * underlying filesystem which really handles the fsync() 5945 * supports shared locking, the stacked filesystem must also 5946 * be prepared for its VOP_FSYNC() operation to be called 5947 * with only a shared lock. 
On the other hand, if the 5948 * stacked filesystem claims support for shared write 5949 * locking but the underlying filesystem does not, and the 5950 * caller incorrectly uses a shared lock, this condition 5951 * should still be caught when the stacked filesystem 5952 * invokes VOP_FSYNC() on the underlying filesystem. 5953 */ 5954 else if (MNT_SHARED_WRITES(vp->v_mount)) 5955 ASSERT_VOP_LOCKED(vp, name); 5956 else 5957 ASSERT_VOP_ELOCKED(vp, name); 5958 } 5959 5960 void 5961 vop_fsync_debugpre(void *a) 5962 { 5963 struct vop_fsync_args *ap; 5964 5965 ap = a; 5966 vop_fsync_debugprepost(ap->a_vp, "fsync"); 5967 } 5968 5969 void 5970 vop_fsync_debugpost(void *a, int rc __unused) 5971 { 5972 struct vop_fsync_args *ap; 5973 5974 ap = a; 5975 vop_fsync_debugprepost(ap->a_vp, "fsync"); 5976 } 5977 5978 void 5979 vop_fdatasync_debugpre(void *a) 5980 { 5981 struct vop_fdatasync_args *ap; 5982 5983 ap = a; 5984 vop_fsync_debugprepost(ap->a_vp, "fsync"); 5985 } 5986 5987 void 5988 vop_fdatasync_debugpost(void *a, int rc __unused) 5989 { 5990 struct vop_fdatasync_args *ap; 5991 5992 ap = a; 5993 vop_fsync_debugprepost(ap->a_vp, "fsync"); 5994 } 5995 5996 void 5997 vop_strategy_debugpre(void *ap) 5998 { 5999 struct vop_strategy_args *a; 6000 struct buf *bp; 6001 6002 a = ap; 6003 bp = a->a_bp; 6004 6005 /* 6006 * Cluster ops lock their component buffers but not the IO container. 6007 */ 6008 if ((bp->b_flags & B_CLUSTER) != 0) 6009 return; 6010 6011 if (!KERNEL_PANICKED() && !BUF_ISLOCKED(bp)) { 6012 if (vfs_badlock_print) 6013 printf( 6014 "VOP_STRATEGY: bp is not locked but should be\n"); 6015 if (vfs_badlock_ddb) 6016 kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); 6017 } 6018 } 6019 6020 void 6021 vop_lock_debugpre(void *ap) 6022 { 6023 struct vop_lock1_args *a = ap; 6024 6025 if ((a->a_flags & LK_INTERLOCK) == 0) 6026 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); 6027 else 6028 ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK"); 6029 } 6030 6031 void 6032 vop_lock_debugpost(void *ap, int rc) 6033 { 6034 struct vop_lock1_args *a = ap; 6035 6036 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); 6037 if (rc == 0 && (a->a_flags & LK_EXCLOTHER) == 0) 6038 ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK"); 6039 } 6040 6041 void 6042 vop_unlock_debugpre(void *ap) 6043 { 6044 struct vop_unlock_args *a = ap; 6045 struct vnode *vp = a->a_vp; 6046 6047 VNPASS(vn_get_state(vp) != VSTATE_UNINITIALIZED, vp); 6048 ASSERT_VOP_LOCKED(vp, "VOP_UNLOCK"); 6049 } 6050 6051 void 6052 vop_need_inactive_debugpre(void *ap) 6053 { 6054 struct vop_need_inactive_args *a = ap; 6055 6056 ASSERT_VI_LOCKED(a->a_vp, "VOP_NEED_INACTIVE"); 6057 } 6058 6059 void 6060 vop_need_inactive_debugpost(void *ap, int rc) 6061 { 6062 struct vop_need_inactive_args *a = ap; 6063 6064 ASSERT_VI_LOCKED(a->a_vp, "VOP_NEED_INACTIVE"); 6065 } 6066 #endif 6067 6068 void 6069 vop_allocate_post(void *ap, int rc) 6070 { 6071 struct vop_allocate_args *a; 6072 6073 a = ap; 6074 if (rc == 0) 6075 INOTIFY(a->a_vp, IN_MODIFY); 6076 } 6077 6078 void 6079 vop_copy_file_range_post(void *ap, int rc) 6080 { 6081 struct vop_copy_file_range_args *a; 6082 6083 a = ap; 6084 if (rc == 0) { 6085 INOTIFY(a->a_invp, IN_ACCESS); 6086 INOTIFY(a->a_outvp, IN_MODIFY); 6087 } 6088 } 6089 6090 void 6091 vop_create_pre(void *ap) 6092 { 6093 struct vop_create_args *a; 6094 struct vnode *dvp; 6095 6096 a = ap; 6097 dvp = a->a_dvp; 6098 vn_seqc_write_begin(dvp); 6099 } 6100 6101 void 6102 vop_create_post(void *ap, int rc) 6103 { 6104 struct vop_create_args *a; 6105 struct vnode *dvp; 6106 6107 a = ap; 6108 dvp = 
a->a_dvp; 6109 vn_seqc_write_end(dvp); 6110 if (!rc) { 6111 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); 6112 INOTIFY_NAME(*a->a_vpp, dvp, a->a_cnp, IN_CREATE); 6113 } 6114 } 6115 6116 void 6117 vop_deallocate_post(void *ap, int rc) 6118 { 6119 struct vop_deallocate_args *a; 6120 6121 a = ap; 6122 if (rc == 0) 6123 INOTIFY(a->a_vp, IN_MODIFY); 6124 } 6125 6126 void 6127 vop_whiteout_pre(void *ap) 6128 { 6129 struct vop_whiteout_args *a; 6130 struct vnode *dvp; 6131 6132 a = ap; 6133 dvp = a->a_dvp; 6134 vn_seqc_write_begin(dvp); 6135 } 6136 6137 void 6138 vop_whiteout_post(void *ap, int rc) 6139 { 6140 struct vop_whiteout_args *a; 6141 struct vnode *dvp; 6142 6143 a = ap; 6144 dvp = a->a_dvp; 6145 vn_seqc_write_end(dvp); 6146 } 6147 6148 void 6149 vop_deleteextattr_pre(void *ap) 6150 { 6151 struct vop_deleteextattr_args *a; 6152 struct vnode *vp; 6153 6154 a = ap; 6155 vp = a->a_vp; 6156 vn_seqc_write_begin(vp); 6157 } 6158 6159 void 6160 vop_deleteextattr_post(void *ap, int rc) 6161 { 6162 struct vop_deleteextattr_args *a; 6163 struct vnode *vp; 6164 6165 a = ap; 6166 vp = a->a_vp; 6167 vn_seqc_write_end(vp); 6168 if (!rc) { 6169 VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); 6170 INOTIFY(vp, IN_ATTRIB); 6171 } 6172 } 6173 6174 void 6175 vop_link_pre(void *ap) 6176 { 6177 struct vop_link_args *a; 6178 struct vnode *vp, *tdvp; 6179 6180 a = ap; 6181 vp = a->a_vp; 6182 tdvp = a->a_tdvp; 6183 vn_seqc_write_begin(vp); 6184 vn_seqc_write_begin(tdvp); 6185 } 6186 6187 void 6188 vop_link_post(void *ap, int rc) 6189 { 6190 struct vop_link_args *a; 6191 struct vnode *vp, *tdvp; 6192 6193 a = ap; 6194 vp = a->a_vp; 6195 tdvp = a->a_tdvp; 6196 vn_seqc_write_end(vp); 6197 vn_seqc_write_end(tdvp); 6198 if (!rc) { 6199 VFS_KNOTE_LOCKED(vp, NOTE_LINK); 6200 VFS_KNOTE_LOCKED(tdvp, NOTE_WRITE); 6201 INOTIFY_NAME(vp, tdvp, a->a_cnp, _IN_ATTRIB_LINKCOUNT); 6202 INOTIFY_NAME(vp, tdvp, a->a_cnp, IN_CREATE); 6203 } 6204 } 6205 6206 void 6207 vop_mkdir_pre(void *ap) 6208 { 6209 struct vop_mkdir_args *a; 6210 struct vnode *dvp; 6211 6212 a = ap; 6213 dvp = a->a_dvp; 6214 vn_seqc_write_begin(dvp); 6215 } 6216 6217 void 6218 vop_mkdir_post(void *ap, int rc) 6219 { 6220 struct vop_mkdir_args *a; 6221 struct vnode *dvp; 6222 6223 a = ap; 6224 dvp = a->a_dvp; 6225 vn_seqc_write_end(dvp); 6226 if (!rc) { 6227 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK); 6228 INOTIFY_NAME(*a->a_vpp, dvp, a->a_cnp, IN_CREATE); 6229 } 6230 } 6231 6232 #ifdef DEBUG_VFS_LOCKS 6233 void 6234 vop_mkdir_debugpost(void *ap, int rc) 6235 { 6236 struct vop_mkdir_args *a; 6237 6238 a = ap; 6239 if (!rc) 6240 cache_validate(a->a_dvp, *a->a_vpp, a->a_cnp); 6241 } 6242 #endif 6243 6244 void 6245 vop_mknod_pre(void *ap) 6246 { 6247 struct vop_mknod_args *a; 6248 struct vnode *dvp; 6249 6250 a = ap; 6251 dvp = a->a_dvp; 6252 vn_seqc_write_begin(dvp); 6253 } 6254 6255 void 6256 vop_mknod_post(void *ap, int rc) 6257 { 6258 struct vop_mknod_args *a; 6259 struct vnode *dvp; 6260 6261 a = ap; 6262 dvp = a->a_dvp; 6263 vn_seqc_write_end(dvp); 6264 if (!rc) { 6265 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); 6266 INOTIFY_NAME(*a->a_vpp, dvp, a->a_cnp, IN_CREATE); 6267 } 6268 } 6269 6270 void 6271 vop_reclaim_post(void *ap, int rc) 6272 { 6273 struct vop_reclaim_args *a; 6274 struct vnode *vp; 6275 6276 a = ap; 6277 vp = a->a_vp; 6278 ASSERT_VOP_IN_SEQC(vp); 6279 if (!rc) { 6280 VFS_KNOTE_LOCKED(vp, NOTE_REVOKE); 6281 INOTIFY_REVOKE(vp); 6282 } 6283 } 6284 6285 void 6286 vop_remove_pre(void *ap) 6287 { 6288 struct vop_remove_args *a; 6289 struct vnode *dvp, *vp; 6290 6291 a = ap; 
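	/*
	 * As with the other vop_*_pre() hooks in this file, bracket the
	 * operation with vn_seqc_write_begin() on every vnode whose state
	 * is about to change, and additionally let upper (stacked) mounts
	 * know about the pending unlink via vfs_notify_upper().  The
	 * matching vn_seqc_write_end() calls are made in vop_remove_post().
	 */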
6292 dvp = a->a_dvp; 6293 vp = a->a_vp; 6294 vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK); 6295 vn_seqc_write_begin(dvp); 6296 vn_seqc_write_begin(vp); 6297 } 6298 6299 void 6300 vop_remove_post(void *ap, int rc) 6301 { 6302 struct vop_remove_args *a; 6303 struct vnode *dvp, *vp; 6304 6305 a = ap; 6306 dvp = a->a_dvp; 6307 vp = a->a_vp; 6308 vn_seqc_write_end(dvp); 6309 vn_seqc_write_end(vp); 6310 if (!rc) { 6311 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); 6312 VFS_KNOTE_LOCKED(vp, NOTE_DELETE); 6313 INOTIFY_NAME(vp, dvp, a->a_cnp, _IN_ATTRIB_LINKCOUNT); 6314 INOTIFY_NAME(vp, dvp, a->a_cnp, IN_DELETE); 6315 } 6316 } 6317 6318 void 6319 vop_rename_post(void *ap, int rc) 6320 { 6321 struct vop_rename_args *a = ap; 6322 long hint; 6323 6324 if (!rc) { 6325 hint = NOTE_WRITE; 6326 if (a->a_fdvp == a->a_tdvp) { 6327 if (a->a_tvp != NULL && a->a_tvp->v_type == VDIR) 6328 hint |= NOTE_LINK; 6329 VFS_KNOTE_UNLOCKED(a->a_fdvp, hint); 6330 VFS_KNOTE_UNLOCKED(a->a_tdvp, hint); 6331 } else { 6332 hint |= NOTE_EXTEND; 6333 if (a->a_fvp->v_type == VDIR) 6334 hint |= NOTE_LINK; 6335 VFS_KNOTE_UNLOCKED(a->a_fdvp, hint); 6336 6337 if (a->a_fvp->v_type == VDIR && a->a_tvp != NULL && 6338 a->a_tvp->v_type == VDIR) 6339 hint &= ~NOTE_LINK; 6340 VFS_KNOTE_UNLOCKED(a->a_tdvp, hint); 6341 } 6342 6343 VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME); 6344 if (a->a_tvp) 6345 VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE); 6346 INOTIFY_MOVE(a->a_fvp, a->a_fdvp, a->a_fcnp, a->a_tvp, 6347 a->a_tdvp, a->a_tcnp); 6348 } 6349 if (a->a_tdvp != a->a_fdvp) 6350 vdrop(a->a_fdvp); 6351 if (a->a_tvp != a->a_fvp) 6352 vdrop(a->a_fvp); 6353 vdrop(a->a_tdvp); 6354 if (a->a_tvp) 6355 vdrop(a->a_tvp); 6356 } 6357 6358 void 6359 vop_rmdir_pre(void *ap) 6360 { 6361 struct vop_rmdir_args *a; 6362 struct vnode *dvp, *vp; 6363 6364 a = ap; 6365 dvp = a->a_dvp; 6366 vp = a->a_vp; 6367 vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK); 6368 vn_seqc_write_begin(dvp); 6369 vn_seqc_write_begin(vp); 6370 } 6371 6372 void 6373 vop_rmdir_post(void *ap, int rc) 6374 { 6375 struct vop_rmdir_args *a; 6376 struct vnode *dvp, *vp; 6377 6378 a = ap; 6379 dvp = a->a_dvp; 6380 vp = a->a_vp; 6381 vn_seqc_write_end(dvp); 6382 vn_seqc_write_end(vp); 6383 if (!rc) { 6384 vp->v_vflag |= VV_UNLINKED; 6385 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK); 6386 VFS_KNOTE_LOCKED(vp, NOTE_DELETE); 6387 INOTIFY_NAME(vp, dvp, a->a_cnp, IN_DELETE); 6388 } 6389 } 6390 6391 void 6392 vop_setattr_pre(void *ap) 6393 { 6394 struct vop_setattr_args *a; 6395 struct vnode *vp; 6396 6397 a = ap; 6398 vp = a->a_vp; 6399 vn_seqc_write_begin(vp); 6400 } 6401 6402 void 6403 vop_setattr_post(void *ap, int rc) 6404 { 6405 struct vop_setattr_args *a; 6406 struct vnode *vp; 6407 6408 a = ap; 6409 vp = a->a_vp; 6410 vn_seqc_write_end(vp); 6411 if (!rc) { 6412 VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB); 6413 INOTIFY(vp, IN_ATTRIB); 6414 } 6415 } 6416 6417 void 6418 vop_setacl_pre(void *ap) 6419 { 6420 struct vop_setacl_args *a; 6421 struct vnode *vp; 6422 6423 a = ap; 6424 vp = a->a_vp; 6425 vn_seqc_write_begin(vp); 6426 } 6427 6428 void 6429 vop_setacl_post(void *ap, int rc __unused) 6430 { 6431 struct vop_setacl_args *a; 6432 struct vnode *vp; 6433 6434 a = ap; 6435 vp = a->a_vp; 6436 vn_seqc_write_end(vp); 6437 } 6438 6439 void 6440 vop_setextattr_pre(void *ap) 6441 { 6442 struct vop_setextattr_args *a; 6443 struct vnode *vp; 6444 6445 a = ap; 6446 vp = a->a_vp; 6447 vn_seqc_write_begin(vp); 6448 } 6449 6450 void 6451 vop_setextattr_post(void *ap, int rc) 6452 { 6453 struct vop_setextattr_args *a; 6454 struct vnode 
*vp; 6455 6456 a = ap; 6457 vp = a->a_vp; 6458 vn_seqc_write_end(vp); 6459 if (!rc) { 6460 VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB); 6461 INOTIFY(vp, IN_ATTRIB); 6462 } 6463 } 6464 6465 void 6466 vop_symlink_pre(void *ap) 6467 { 6468 struct vop_symlink_args *a; 6469 struct vnode *dvp; 6470 6471 a = ap; 6472 dvp = a->a_dvp; 6473 vn_seqc_write_begin(dvp); 6474 } 6475 6476 void 6477 vop_symlink_post(void *ap, int rc) 6478 { 6479 struct vop_symlink_args *a; 6480 struct vnode *dvp; 6481 6482 a = ap; 6483 dvp = a->a_dvp; 6484 vn_seqc_write_end(dvp); 6485 if (!rc) { 6486 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); 6487 INOTIFY_NAME(*a->a_vpp, dvp, a->a_cnp, IN_CREATE); 6488 } 6489 } 6490 6491 void 6492 vop_open_post(void *ap, int rc) 6493 { 6494 struct vop_open_args *a = ap; 6495 6496 if (!rc) { 6497 VFS_KNOTE_LOCKED(a->a_vp, NOTE_OPEN); 6498 INOTIFY(a->a_vp, IN_OPEN); 6499 } 6500 } 6501 6502 void 6503 vop_close_post(void *ap, int rc) 6504 { 6505 struct vop_close_args *a = ap; 6506 6507 if (!rc && (a->a_cred != NOCRED || /* filter out revokes */ 6508 !VN_IS_DOOMED(a->a_vp))) { 6509 VFS_KNOTE_LOCKED(a->a_vp, (a->a_fflag & FWRITE) != 0 ? 6510 NOTE_CLOSE_WRITE : NOTE_CLOSE); 6511 INOTIFY(a->a_vp, (a->a_fflag & FWRITE) != 0 ? 6512 IN_CLOSE_WRITE : IN_CLOSE_NOWRITE); 6513 } 6514 } 6515 6516 void 6517 vop_read_post(void *ap, int rc) 6518 { 6519 struct vop_read_args *a = ap; 6520 6521 if (!rc) { 6522 VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ); 6523 INOTIFY(a->a_vp, IN_ACCESS); 6524 } 6525 } 6526 6527 void 6528 vop_read_pgcache_post(void *ap, int rc) 6529 { 6530 struct vop_read_pgcache_args *a = ap; 6531 6532 if (!rc) 6533 VFS_KNOTE_UNLOCKED(a->a_vp, NOTE_READ); 6534 } 6535 6536 void 6537 vop_readdir_post(void *ap, int rc) 6538 { 6539 struct vop_readdir_args *a = ap; 6540 6541 if (!rc) { 6542 VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ); 6543 INOTIFY(a->a_vp, IN_ACCESS); 6544 } 6545 } 6546 6547 static struct knlist fs_knlist; 6548 6549 static void 6550 vfs_event_init(void *arg) 6551 { 6552 knlist_init_mtx(&fs_knlist, NULL); 6553 } 6554 /* XXX - correct order? */ 6555 SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL); 6556 6557 void 6558 vfs_event_signal(fsid_t *fsid, uint32_t event, intptr_t data __unused) 6559 { 6560 6561 KNOTE_UNLOCKED(&fs_knlist, event); 6562 } 6563 6564 static int filt_fsattach(struct knote *kn); 6565 static void filt_fsdetach(struct knote *kn); 6566 static int filt_fsevent(struct knote *kn, long hint); 6567 6568 const struct filterops fs_filtops = { 6569 .f_isfd = 0, 6570 .f_attach = filt_fsattach, 6571 .f_detach = filt_fsdetach, 6572 .f_event = filt_fsevent, 6573 }; 6574 6575 static int 6576 filt_fsattach(struct knote *kn) 6577 { 6578 6579 kn->kn_flags |= EV_CLEAR; 6580 knlist_add(&fs_knlist, kn, 0); 6581 return (0); 6582 } 6583 6584 static void 6585 filt_fsdetach(struct knote *kn) 6586 { 6587 6588 knlist_remove(&fs_knlist, kn, 0); 6589 } 6590 6591 static int 6592 filt_fsevent(struct knote *kn, long hint) 6593 { 6594 6595 kn->kn_fflags |= kn->kn_sfflags & hint; 6596 6597 return (kn->kn_fflags != 0); 6598 } 6599 6600 static int 6601 sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS) 6602 { 6603 struct vfsidctl vc; 6604 int error; 6605 struct mount *mp; 6606 6607 if (req->newptr == NULL) 6608 return (EINVAL); 6609 error = SYSCTL_IN(req, &vc, sizeof(vc)); 6610 if (error) 6611 return (error); 6612 if (vc.vc_vers != VFS_CTL_VERS1) 6613 return (EINVAL); 6614 mp = vfs_getvfs(&vc.vc_fsid); 6615 if (mp == NULL) 6616 return (ENOENT); 6617 /* ensure that a specific sysctl goes to the right filesystem. 
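	 * A userland consumer would, roughly, fill in a struct vfsidctl
	 * (vc_vers = VFS_CTL_VERS1, vc_fsid of the target mount, vc_op and
	 * vc_fstypename) and write it to the vfs.ctl sysctl; a vc_fstypename
	 * of "*" matches any filesystem type, anything else must equal
	 * mp->mnt_vfc->vfc_name or the request fails with EINVAL.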
*/ 6618 if (strcmp(vc.vc_fstypename, "*") != 0 && 6619 strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) { 6620 vfs_rel(mp); 6621 return (EINVAL); 6622 } 6623 VCTLTOREQ(&vc, req); 6624 error = VFS_SYSCTL(mp, vc.vc_op, req); 6625 vfs_rel(mp); 6626 return (error); 6627 } 6628 6629 SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE | CTLFLAG_MPSAFE | CTLFLAG_WR, 6630 NULL, 0, sysctl_vfs_ctl, "", 6631 "Sysctl by fsid"); 6632 6633 /* 6634 * Function to initialize a va_filerev field sensibly. 6635 * XXX: Wouldn't a random number make a lot more sense ?? 6636 */ 6637 u_quad_t 6638 init_va_filerev(void) 6639 { 6640 struct bintime bt; 6641 6642 getbinuptime(&bt); 6643 return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL)); 6644 } 6645 6646 static int filt_vfsread(struct knote *kn, long hint); 6647 static int filt_vfswrite(struct knote *kn, long hint); 6648 static int filt_vfsvnode(struct knote *kn, long hint); 6649 static void filt_vfsdetach(struct knote *kn); 6650 static int filt_vfsdump(struct proc *p, struct knote *kn, 6651 struct kinfo_knote *kin); 6652 6653 static const struct filterops vfsread_filtops = { 6654 .f_isfd = 1, 6655 .f_detach = filt_vfsdetach, 6656 .f_event = filt_vfsread, 6657 .f_userdump = filt_vfsdump, 6658 }; 6659 static const struct filterops vfswrite_filtops = { 6660 .f_isfd = 1, 6661 .f_detach = filt_vfsdetach, 6662 .f_event = filt_vfswrite, 6663 .f_userdump = filt_vfsdump, 6664 }; 6665 static const struct filterops vfsvnode_filtops = { 6666 .f_isfd = 1, 6667 .f_detach = filt_vfsdetach, 6668 .f_event = filt_vfsvnode, 6669 .f_userdump = filt_vfsdump, 6670 }; 6671 6672 static void 6673 vfs_knllock(void *arg) 6674 { 6675 struct vnode *vp = arg; 6676 6677 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 6678 } 6679 6680 static void 6681 vfs_knlunlock(void *arg) 6682 { 6683 struct vnode *vp = arg; 6684 6685 VOP_UNLOCK(vp); 6686 } 6687 6688 static void 6689 vfs_knl_assert_lock(void *arg, int what) 6690 { 6691 #ifdef DEBUG_VFS_LOCKS 6692 struct vnode *vp = arg; 6693 6694 if (what == LA_LOCKED) 6695 ASSERT_VOP_LOCKED(vp, "vfs_knl_assert_locked"); 6696 else 6697 ASSERT_VOP_UNLOCKED(vp, "vfs_knl_assert_unlocked"); 6698 #endif 6699 } 6700 6701 int 6702 vfs_kqfilter(struct vop_kqfilter_args *ap) 6703 { 6704 struct vnode *vp = ap->a_vp; 6705 struct knote *kn = ap->a_kn; 6706 struct knlist *knl; 6707 6708 KASSERT(vp->v_type != VFIFO || (kn->kn_filter != EVFILT_READ && 6709 kn->kn_filter != EVFILT_WRITE), 6710 ("READ/WRITE filter on a FIFO leaked through")); 6711 switch (kn->kn_filter) { 6712 case EVFILT_READ: 6713 kn->kn_fop = &vfsread_filtops; 6714 break; 6715 case EVFILT_WRITE: 6716 kn->kn_fop = &vfswrite_filtops; 6717 break; 6718 case EVFILT_VNODE: 6719 kn->kn_fop = &vfsvnode_filtops; 6720 break; 6721 default: 6722 return (EINVAL); 6723 } 6724 6725 kn->kn_hook = (caddr_t)vp; 6726 6727 v_addpollinfo(vp); 6728 if (vp->v_pollinfo == NULL) 6729 return (ENOMEM); 6730 knl = &vp->v_pollinfo->vpi_selinfo.si_note; 6731 vhold(vp); 6732 knlist_add(knl, kn, 0); 6733 6734 return (0); 6735 } 6736 6737 /* 6738 * Detach knote from vnode 6739 */ 6740 static void 6741 filt_vfsdetach(struct knote *kn) 6742 { 6743 struct vnode *vp = (struct vnode *)kn->kn_hook; 6744 6745 KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo")); 6746 knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0); 6747 vdrop(vp); 6748 } 6749 6750 /*ARGSUSED*/ 6751 static int 6752 filt_vfsread(struct knote *kn, long hint) 6753 { 6754 struct vnode *vp = (struct vnode *)kn->kn_hook; 6755 off_t size; 6756 int res; 6757 6758 /* 6759 * 
filesystem is gone, so set the EOF flag and schedule 6760 * the knote for deletion. 6761 */ 6762 if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) { 6763 VI_LOCK(vp); 6764 kn->kn_flags |= (EV_EOF | EV_ONESHOT); 6765 VI_UNLOCK(vp); 6766 return (1); 6767 } 6768 6769 if (vn_getsize_locked(vp, &size, curthread->td_ucred) != 0) 6770 return (0); 6771 6772 VI_LOCK(vp); 6773 kn->kn_data = size - kn->kn_fp->f_offset; 6774 res = (kn->kn_sfflags & NOTE_FILE_POLL) != 0 || kn->kn_data != 0; 6775 VI_UNLOCK(vp); 6776 return (res); 6777 } 6778 6779 /*ARGSUSED*/ 6780 static int 6781 filt_vfswrite(struct knote *kn, long hint) 6782 { 6783 struct vnode *vp = (struct vnode *)kn->kn_hook; 6784 6785 VI_LOCK(vp); 6786 6787 /* 6788 * filesystem is gone, so set the EOF flag and schedule 6789 * the knote for deletion. 6790 */ 6791 if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) 6792 kn->kn_flags |= (EV_EOF | EV_ONESHOT); 6793 6794 kn->kn_data = 0; 6795 VI_UNLOCK(vp); 6796 return (1); 6797 } 6798 6799 static int 6800 filt_vfsvnode(struct knote *kn, long hint) 6801 { 6802 struct vnode *vp = (struct vnode *)kn->kn_hook; 6803 int res; 6804 6805 VI_LOCK(vp); 6806 if (kn->kn_sfflags & hint) 6807 kn->kn_fflags |= hint; 6808 if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) { 6809 kn->kn_flags |= EV_EOF; 6810 VI_UNLOCK(vp); 6811 return (1); 6812 } 6813 res = (kn->kn_fflags != 0); 6814 VI_UNLOCK(vp); 6815 return (res); 6816 } 6817 6818 static int 6819 filt_vfsdump(struct proc *p, struct knote *kn, struct kinfo_knote *kin) 6820 { 6821 struct vattr va; 6822 struct vnode *vp; 6823 char *fullpath, *freepath; 6824 int error; 6825 6826 kin->knt_extdata = KNOTE_EXTDATA_VNODE; 6827 6828 vp = kn->kn_fp->f_vnode; 6829 kin->knt_vnode.knt_vnode_type = vntype_to_kinfo(vp->v_type); 6830 6831 va.va_fsid = VNOVAL; 6832 vn_lock(vp, LK_SHARED | LK_RETRY); 6833 error = VOP_GETATTR(vp, &va, curthread->td_ucred); 6834 VOP_UNLOCK(vp); 6835 if (error != 0) 6836 return (error); 6837 kin->knt_vnode.knt_vnode_fsid = va.va_fsid; 6838 kin->knt_vnode.knt_vnode_fileid = va.va_fileid; 6839 6840 freepath = NULL; 6841 fullpath = "-"; 6842 error = vn_fullpath(vp, &fullpath, &freepath); 6843 if (error == 0) { 6844 strlcpy(kin->knt_vnode.knt_vnode_fullpath, fullpath, 6845 sizeof(kin->knt_vnode.knt_vnode_fullpath)); 6846 } 6847 if (freepath != NULL) 6848 free(freepath, M_TEMP); 6849 6850 return (0); 6851 } 6852 6853 int 6854 vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off) 6855 { 6856 int error; 6857 6858 if (dp->d_reclen > ap->a_uio->uio_resid) 6859 return (ENAMETOOLONG); 6860 error = uiomove(dp, dp->d_reclen, ap->a_uio); 6861 if (error) { 6862 if (ap->a_ncookies != NULL) { 6863 if (ap->a_cookies != NULL) 6864 free(ap->a_cookies, M_TEMP); 6865 ap->a_cookies = NULL; 6866 *ap->a_ncookies = 0; 6867 } 6868 return (error); 6869 } 6870 if (ap->a_ncookies == NULL) 6871 return (0); 6872 6873 KASSERT(ap->a_cookies, 6874 ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!")); 6875 6876 *ap->a_cookies = realloc(*ap->a_cookies, 6877 (*ap->a_ncookies + 1) * sizeof(uint64_t), M_TEMP, M_WAITOK | M_ZERO); 6878 (*ap->a_cookies)[*ap->a_ncookies] = off; 6879 *ap->a_ncookies += 1; 6880 return (0); 6881 } 6882 6883 /* 6884 * The purpose of this routine is to remove granularity from accmode_t, 6885 * reducing it into standard unix access bits - VEXEC, VREAD, VWRITE, 6886 * VADMIN and VAPPEND. 
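 *
 * For example, a request carrying any of the VADMIN_PERMS bits is
 * collapsed into plain VADMIN, the VSTAT_PERMS and VSYNCHRONIZE bits
 * are silently discarded, and a request containing VDELETE or
 * VDELETE_CHILD is rejected outright with EPERM.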
6887 * 6888 * If it returns 0, the caller is supposed to continue with the usual 6889 * access checks using 'accmode' as modified by this routine. If it 6890 * returns nonzero value, the caller is supposed to return that value 6891 * as errno. 6892 * 6893 * Note that after this routine runs, accmode may be zero. 6894 */ 6895 int 6896 vfs_unixify_accmode(accmode_t *accmode) 6897 { 6898 /* 6899 * There is no way to specify explicit "deny" rule using 6900 * file mode or POSIX.1e ACLs. 6901 */ 6902 if (*accmode & VEXPLICIT_DENY) { 6903 *accmode = 0; 6904 return (0); 6905 } 6906 6907 /* 6908 * None of these can be translated into usual access bits. 6909 * Also, the common case for NFSv4 ACLs is to not contain 6910 * either of these bits. Caller should check for VWRITE 6911 * on the containing directory instead. 6912 */ 6913 if (*accmode & (VDELETE_CHILD | VDELETE)) 6914 return (EPERM); 6915 6916 if (*accmode & VADMIN_PERMS) { 6917 *accmode &= ~VADMIN_PERMS; 6918 *accmode |= VADMIN; 6919 } 6920 6921 /* 6922 * There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL 6923 * or VSYNCHRONIZE using file mode or POSIX.1e ACL. 6924 */ 6925 *accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE); 6926 6927 return (0); 6928 } 6929 6930 /* 6931 * Clear out a doomed vnode (if any) and replace it with a new one as long 6932 * as the fs is not being unmounted. Return the root vnode to the caller. 6933 */ 6934 static int __noinline 6935 vfs_cache_root_fallback(struct mount *mp, int flags, struct vnode **vpp) 6936 { 6937 struct vnode *vp; 6938 int error; 6939 6940 restart: 6941 if (mp->mnt_rootvnode != NULL) { 6942 MNT_ILOCK(mp); 6943 vp = mp->mnt_rootvnode; 6944 if (vp != NULL) { 6945 if (!VN_IS_DOOMED(vp)) { 6946 vrefact(vp); 6947 MNT_IUNLOCK(mp); 6948 error = vn_lock(vp, flags); 6949 if (error == 0) { 6950 *vpp = vp; 6951 return (0); 6952 } 6953 vrele(vp); 6954 goto restart; 6955 } 6956 /* 6957 * Clear the old one. 
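	 * The reference on it is dropped only below, after the interlock
	 * has been released and vfs_op_barrier_wait() has made sure that
	 * any lock-free reader which may have picked the pointer up in
	 * vfs_cache_root() is done with it.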
6958 */ 6959 mp->mnt_rootvnode = NULL; 6960 } 6961 MNT_IUNLOCK(mp); 6962 if (vp != NULL) { 6963 vfs_op_barrier_wait(mp); 6964 vrele(vp); 6965 } 6966 } 6967 error = VFS_CACHEDROOT(mp, flags, vpp); 6968 if (error != 0) 6969 return (error); 6970 if (mp->mnt_vfs_ops == 0) { 6971 MNT_ILOCK(mp); 6972 if (mp->mnt_vfs_ops != 0) { 6973 MNT_IUNLOCK(mp); 6974 return (0); 6975 } 6976 if (mp->mnt_rootvnode == NULL) { 6977 vrefact(*vpp); 6978 mp->mnt_rootvnode = *vpp; 6979 } else { 6980 if (mp->mnt_rootvnode != *vpp) { 6981 if (!VN_IS_DOOMED(mp->mnt_rootvnode)) { 6982 panic("%s: mismatch between vnode returned " 6983 " by VFS_CACHEDROOT and the one cached " 6984 " (%p != %p)", 6985 __func__, *vpp, mp->mnt_rootvnode); 6986 } 6987 } 6988 } 6989 MNT_IUNLOCK(mp); 6990 } 6991 return (0); 6992 } 6993 6994 int 6995 vfs_cache_root(struct mount *mp, int flags, struct vnode **vpp) 6996 { 6997 struct mount_pcpu *mpcpu; 6998 struct vnode *vp; 6999 int error; 7000 7001 if (!vfs_op_thread_enter(mp, mpcpu)) 7002 return (vfs_cache_root_fallback(mp, flags, vpp)); 7003 vp = atomic_load_ptr(&mp->mnt_rootvnode); 7004 if (vp == NULL || VN_IS_DOOMED(vp)) { 7005 vfs_op_thread_exit(mp, mpcpu); 7006 return (vfs_cache_root_fallback(mp, flags, vpp)); 7007 } 7008 vrefact(vp); 7009 vfs_op_thread_exit(mp, mpcpu); 7010 error = vn_lock(vp, flags); 7011 if (error != 0) { 7012 vrele(vp); 7013 return (vfs_cache_root_fallback(mp, flags, vpp)); 7014 } 7015 *vpp = vp; 7016 return (0); 7017 } 7018 7019 struct vnode * 7020 vfs_cache_root_clear(struct mount *mp) 7021 { 7022 struct vnode *vp; 7023 7024 /* 7025 * ops > 0 guarantees there is nobody who can see this vnode 7026 */ 7027 MPASS(mp->mnt_vfs_ops > 0); 7028 vp = mp->mnt_rootvnode; 7029 if (vp != NULL) 7030 vn_seqc_write_begin(vp); 7031 mp->mnt_rootvnode = NULL; 7032 return (vp); 7033 } 7034 7035 void 7036 vfs_cache_root_set(struct mount *mp, struct vnode *vp) 7037 { 7038 7039 MPASS(mp->mnt_vfs_ops > 0); 7040 vrefact(vp); 7041 mp->mnt_rootvnode = vp; 7042 } 7043 7044 /* 7045 * These are helper functions for filesystems to traverse all 7046 * their vnodes. See MNT_VNODE_FOREACH_ALL() in sys/mount.h. 7047 * 7048 * This interface replaces MNT_VNODE_FOREACH. 7049 */ 7050 7051 struct vnode * 7052 __mnt_vnode_next_all(struct vnode **mvp, struct mount *mp) 7053 { 7054 struct vnode *vp; 7055 7056 maybe_yield(); 7057 MNT_ILOCK(mp); 7058 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); 7059 for (vp = TAILQ_NEXT(*mvp, v_nmntvnodes); vp != NULL; 7060 vp = TAILQ_NEXT(vp, v_nmntvnodes)) { 7061 /* Allow a racy peek at VIRF_DOOMED to save a lock acquisition. */ 7062 if (vp->v_type == VMARKER || VN_IS_DOOMED(vp)) 7063 continue; 7064 VI_LOCK(vp); 7065 if (VN_IS_DOOMED(vp)) { 7066 VI_UNLOCK(vp); 7067 continue; 7068 } 7069 break; 7070 } 7071 if (vp == NULL) { 7072 __mnt_vnode_markerfree_all(mvp, mp); 7073 /* MNT_IUNLOCK(mp); -- done in above function */ 7074 mtx_assert(MNT_MTX(mp), MA_NOTOWNED); 7075 return (NULL); 7076 } 7077 TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); 7078 TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); 7079 MNT_IUNLOCK(mp); 7080 return (vp); 7081 } 7082 7083 struct vnode * 7084 __mnt_vnode_first_all(struct vnode **mvp, struct mount *mp) 7085 { 7086 struct vnode *vp; 7087 7088 *mvp = vn_alloc_marker(mp); 7089 MNT_ILOCK(mp); 7090 MNT_REF(mp); 7091 7092 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 7093 /* Allow a racy peek at VIRF_DOOMED to save a lock acquisition. 
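	 * The unlocked test is only an optimization; the flag is checked
	 * again under the vnode interlock right below before the vnode is
	 * returned to the caller.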
*/ 7094 if (vp->v_type == VMARKER || VN_IS_DOOMED(vp)) 7095 continue; 7096 VI_LOCK(vp); 7097 if (VN_IS_DOOMED(vp)) { 7098 VI_UNLOCK(vp); 7099 continue; 7100 } 7101 break; 7102 } 7103 if (vp == NULL) { 7104 MNT_REL(mp); 7105 MNT_IUNLOCK(mp); 7106 vn_free_marker(*mvp); 7107 *mvp = NULL; 7108 return (NULL); 7109 } 7110 TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); 7111 MNT_IUNLOCK(mp); 7112 return (vp); 7113 } 7114 7115 void 7116 __mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp) 7117 { 7118 7119 if (*mvp == NULL) { 7120 MNT_IUNLOCK(mp); 7121 return; 7122 } 7123 7124 mtx_assert(MNT_MTX(mp), MA_OWNED); 7125 7126 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); 7127 TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); 7128 MNT_REL(mp); 7129 MNT_IUNLOCK(mp); 7130 vn_free_marker(*mvp); 7131 *mvp = NULL; 7132 } 7133 7134 /* 7135 * These are helper functions for filesystems to traverse their 7136 * lazy vnodes. See MNT_VNODE_FOREACH_LAZY() in sys/mount.h 7137 */ 7138 static void 7139 mnt_vnode_markerfree_lazy(struct vnode **mvp, struct mount *mp) 7140 { 7141 7142 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); 7143 7144 MNT_ILOCK(mp); 7145 MNT_REL(mp); 7146 MNT_IUNLOCK(mp); 7147 vn_free_marker(*mvp); 7148 *mvp = NULL; 7149 } 7150 7151 /* 7152 * Relock the mp mount vnode list lock with the vp vnode interlock in the 7153 * conventional lock order during mnt_vnode_next_lazy iteration. 7154 * 7155 * On entry, the mount vnode list lock is held and the vnode interlock is not. 7156 * The list lock is dropped and reacquired. On success, both locks are held. 7157 * On failure, the mount vnode list lock is held but the vnode interlock is 7158 * not, and the procedure may have yielded. 7159 */ 7160 static bool 7161 mnt_vnode_next_lazy_relock(struct vnode *mvp, struct mount *mp, 7162 struct vnode *vp) 7163 { 7164 7165 VNASSERT(mvp->v_mount == mp && mvp->v_type == VMARKER && 7166 TAILQ_NEXT(mvp, v_lazylist) != NULL, mvp, 7167 ("%s: bad marker", __func__)); 7168 VNASSERT(vp->v_mount == mp && vp->v_type != VMARKER, vp, 7169 ("%s: inappropriate vnode", __func__)); 7170 ASSERT_VI_UNLOCKED(vp, __func__); 7171 mtx_assert(&mp->mnt_listmtx, MA_OWNED); 7172 7173 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, mvp, v_lazylist); 7174 TAILQ_INSERT_BEFORE(vp, mvp, v_lazylist); 7175 7176 /* 7177 * Note we may be racing against vdrop which transitioned the hold 7178 * count to 0 and now waits for the ->mnt_listmtx lock. This is fine, 7179 * if we are the only user after we get the interlock we will just 7180 * vdrop. 7181 */ 7182 vhold(vp); 7183 mtx_unlock(&mp->mnt_listmtx); 7184 VI_LOCK(vp); 7185 if (VN_IS_DOOMED(vp)) { 7186 VNPASS((vp->v_mflag & VMP_LAZYLIST) == 0, vp); 7187 goto out_lost; 7188 } 7189 VNPASS(vp->v_mflag & VMP_LAZYLIST, vp); 7190 /* 7191 * There is nothing to do if we are the last user. 
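	 * refcount_release_if_not_last() drops the hold acquired above only
	 * if other holds remain; if ours would be the last one the vnode is
	 * of no further interest to the iterator and is released with
	 * vdropl() on the out_lost path instead.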
7192 */ 7193 if (!refcount_release_if_not_last(&vp->v_holdcnt)) 7194 goto out_lost; 7195 mtx_lock(&mp->mnt_listmtx); 7196 return (true); 7197 out_lost: 7198 vdropl(vp); 7199 maybe_yield(); 7200 mtx_lock(&mp->mnt_listmtx); 7201 return (false); 7202 } 7203 7204 static struct vnode * 7205 mnt_vnode_next_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb, 7206 void *cbarg) 7207 { 7208 struct vnode *vp; 7209 7210 mtx_assert(&mp->mnt_listmtx, MA_OWNED); 7211 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); 7212 restart: 7213 vp = TAILQ_NEXT(*mvp, v_lazylist); 7214 while (vp != NULL) { 7215 if (vp->v_type == VMARKER) { 7216 vp = TAILQ_NEXT(vp, v_lazylist); 7217 continue; 7218 } 7219 /* 7220 * See if we want to process the vnode. Note we may encounter a 7221 * long string of vnodes we don't care about and hog the list 7222 * as a result. Check for it and requeue the marker. 7223 */ 7224 VNPASS(!VN_IS_DOOMED(vp), vp); 7225 if (!cb(vp, cbarg)) { 7226 if (!should_yield()) { 7227 vp = TAILQ_NEXT(vp, v_lazylist); 7228 continue; 7229 } 7230 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, 7231 v_lazylist); 7232 TAILQ_INSERT_AFTER(&mp->mnt_lazyvnodelist, vp, *mvp, 7233 v_lazylist); 7234 mtx_unlock(&mp->mnt_listmtx); 7235 kern_yield(PRI_USER); 7236 mtx_lock(&mp->mnt_listmtx); 7237 goto restart; 7238 } 7239 /* 7240 * Try-lock because this is the wrong lock order. 7241 */ 7242 if (!VI_TRYLOCK(vp) && 7243 !mnt_vnode_next_lazy_relock(*mvp, mp, vp)) 7244 goto restart; 7245 KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp)); 7246 KASSERT(vp->v_mount == mp || vp->v_mount == NULL, 7247 ("alien vnode on the lazy list %p %p", vp, mp)); 7248 VNPASS(vp->v_mount == mp, vp); 7249 VNPASS(!VN_IS_DOOMED(vp), vp); 7250 break; 7251 } 7252 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, v_lazylist); 7253 7254 /* Check if we are done */ 7255 if (vp == NULL) { 7256 mtx_unlock(&mp->mnt_listmtx); 7257 mnt_vnode_markerfree_lazy(mvp, mp); 7258 return (NULL); 7259 } 7260 TAILQ_INSERT_AFTER(&mp->mnt_lazyvnodelist, vp, *mvp, v_lazylist); 7261 mtx_unlock(&mp->mnt_listmtx); 7262 ASSERT_VI_LOCKED(vp, "lazy iter"); 7263 return (vp); 7264 } 7265 7266 struct vnode * 7267 __mnt_vnode_next_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb, 7268 void *cbarg) 7269 { 7270 7271 maybe_yield(); 7272 mtx_lock(&mp->mnt_listmtx); 7273 return (mnt_vnode_next_lazy(mvp, mp, cb, cbarg)); 7274 } 7275 7276 struct vnode * 7277 __mnt_vnode_first_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb, 7278 void *cbarg) 7279 { 7280 struct vnode *vp; 7281 7282 if (TAILQ_EMPTY(&mp->mnt_lazyvnodelist)) 7283 return (NULL); 7284 7285 *mvp = vn_alloc_marker(mp); 7286 MNT_ILOCK(mp); 7287 MNT_REF(mp); 7288 MNT_IUNLOCK(mp); 7289 7290 mtx_lock(&mp->mnt_listmtx); 7291 vp = TAILQ_FIRST(&mp->mnt_lazyvnodelist); 7292 if (vp == NULL) { 7293 mtx_unlock(&mp->mnt_listmtx); 7294 mnt_vnode_markerfree_lazy(mvp, mp); 7295 return (NULL); 7296 } 7297 TAILQ_INSERT_BEFORE(vp, *mvp, v_lazylist); 7298 return (mnt_vnode_next_lazy(mvp, mp, cb, cbarg)); 7299 } 7300 7301 void 7302 __mnt_vnode_markerfree_lazy(struct vnode **mvp, struct mount *mp) 7303 { 7304 7305 if (*mvp == NULL) 7306 return; 7307 7308 mtx_lock(&mp->mnt_listmtx); 7309 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, v_lazylist); 7310 mtx_unlock(&mp->mnt_listmtx); 7311 mnt_vnode_markerfree_lazy(mvp, mp); 7312 } 7313 7314 int 7315 vn_dir_check_exec(struct vnode *vp, struct componentname *cnp) 7316 { 7317 7318 if ((cnp->cn_flags & NOEXECCHECK) != 0) { 7319 cnp->cn_flags &= ~NOEXECCHECK; 7320 
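		/*
		 * NOEXECCHECK is a one-shot request: consume it here so
		 * that only this call skips the VEXEC check and later
		 * lookups against the same componentname go through the
		 * normal VOP_ACCESS() path below.
		 */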
return (0); 7321 } 7322 7323 return (VOP_ACCESS(vp, VEXEC, cnp->cn_cred, curthread)); 7324 } 7325 7326 /* 7327 * Do not use this variant unless you have means other than the hold count 7328 * to prevent the vnode from getting freed. 7329 */ 7330 void 7331 vn_seqc_write_begin_locked(struct vnode *vp) 7332 { 7333 7334 ASSERT_VI_LOCKED(vp, __func__); 7335 VNPASS(vp->v_holdcnt > 0, vp); 7336 VNPASS(vp->v_seqc_users >= 0, vp); 7337 vp->v_seqc_users++; 7338 if (vp->v_seqc_users == 1) 7339 seqc_sleepable_write_begin(&vp->v_seqc); 7340 } 7341 7342 void 7343 vn_seqc_write_begin(struct vnode *vp) 7344 { 7345 7346 VI_LOCK(vp); 7347 vn_seqc_write_begin_locked(vp); 7348 VI_UNLOCK(vp); 7349 } 7350 7351 void 7352 vn_seqc_write_end_locked(struct vnode *vp) 7353 { 7354 7355 ASSERT_VI_LOCKED(vp, __func__); 7356 VNPASS(vp->v_seqc_users > 0, vp); 7357 vp->v_seqc_users--; 7358 if (vp->v_seqc_users == 0) 7359 seqc_sleepable_write_end(&vp->v_seqc); 7360 } 7361 7362 void 7363 vn_seqc_write_end(struct vnode *vp) 7364 { 7365 7366 VI_LOCK(vp); 7367 vn_seqc_write_end_locked(vp); 7368 VI_UNLOCK(vp); 7369 } 7370 7371 /* 7372 * Special case handling for allocating and freeing vnodes. 7373 * 7374 * The counter remains unchanged on free so that a doomed vnode will 7375 * keep testing as in modify as long as it is accessible with SMR. 7376 */ 7377 static void 7378 vn_seqc_init(struct vnode *vp) 7379 { 7380 7381 vp->v_seqc = 0; 7382 vp->v_seqc_users = 0; 7383 } 7384 7385 static void 7386 vn_seqc_write_end_free(struct vnode *vp) 7387 { 7388 7389 VNPASS(seqc_in_modify(vp->v_seqc), vp); 7390 VNPASS(vp->v_seqc_users == 1, vp); 7391 } 7392 7393 void 7394 vn_irflag_set_locked(struct vnode *vp, short toset) 7395 { 7396 short flags; 7397 7398 ASSERT_VI_LOCKED(vp, __func__); 7399 flags = vn_irflag_read(vp); 7400 VNASSERT((flags & toset) == 0, vp, 7401 ("%s: some of the passed flags already set (have %d, passed %d)\n", 7402 __func__, flags, toset)); 7403 atomic_store_short(&vp->v_irflag, flags | toset); 7404 } 7405 7406 void 7407 vn_irflag_set(struct vnode *vp, short toset) 7408 { 7409 7410 VI_LOCK(vp); 7411 vn_irflag_set_locked(vp, toset); 7412 VI_UNLOCK(vp); 7413 } 7414 7415 void 7416 vn_irflag_set_cond_locked(struct vnode *vp, short toset) 7417 { 7418 short flags; 7419 7420 ASSERT_VI_LOCKED(vp, __func__); 7421 flags = vn_irflag_read(vp); 7422 atomic_store_short(&vp->v_irflag, flags | toset); 7423 } 7424 7425 void 7426 vn_irflag_set_cond(struct vnode *vp, short toset) 7427 { 7428 7429 VI_LOCK(vp); 7430 vn_irflag_set_cond_locked(vp, toset); 7431 VI_UNLOCK(vp); 7432 } 7433 7434 void 7435 vn_irflag_unset_locked(struct vnode *vp, short tounset) 7436 { 7437 short flags; 7438 7439 ASSERT_VI_LOCKED(vp, __func__); 7440 flags = vn_irflag_read(vp); 7441 VNASSERT((flags & tounset) == tounset, vp, 7442 ("%s: some of the passed flags not set (have %d, passed %d)\n", 7443 __func__, flags, tounset)); 7444 atomic_store_short(&vp->v_irflag, flags & ~tounset); 7445 } 7446 7447 void 7448 vn_irflag_unset(struct vnode *vp, short tounset) 7449 { 7450 7451 VI_LOCK(vp); 7452 vn_irflag_unset_locked(vp, tounset); 7453 VI_UNLOCK(vp); 7454 } 7455 7456 int 7457 vn_getsize_locked(struct vnode *vp, off_t *size, struct ucred *cred) 7458 { 7459 struct vattr vattr; 7460 int error; 7461 7462 ASSERT_VOP_LOCKED(vp, __func__); 7463 error = VOP_GETATTR(vp, &vattr, cred); 7464 if (__predict_true(error == 0)) { 7465 if (vattr.va_size <= OFF_MAX) 7466 *size = vattr.va_size; 7467 else 7468 error = EFBIG; 7469 } 7470 return (error); 7471 } 7472 7473 int 7474 
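/*
 * Unlocked convenience wrapper around vn_getsize_locked() above; it takes
 * a shared vnode lock just for the duration of the size lookup.
 */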
vn_getsize(struct vnode *vp, off_t *size, struct ucred *cred) 7475 { 7476 int error; 7477 7478 VOP_LOCK(vp, LK_SHARED); 7479 error = vn_getsize_locked(vp, size, cred); 7480 VOP_UNLOCK(vp); 7481 return (error); 7482 } 7483 7484 #ifdef INVARIANTS 7485 void 7486 vn_set_state_validate(struct vnode *vp, __enum_uint8(vstate) state) 7487 { 7488 7489 switch (vp->v_state) { 7490 case VSTATE_UNINITIALIZED: 7491 switch (state) { 7492 case VSTATE_CONSTRUCTED: 7493 case VSTATE_DESTROYING: 7494 return; 7495 default: 7496 break; 7497 } 7498 break; 7499 case VSTATE_CONSTRUCTED: 7500 ASSERT_VOP_ELOCKED(vp, __func__); 7501 switch (state) { 7502 case VSTATE_DESTROYING: 7503 return; 7504 default: 7505 break; 7506 } 7507 break; 7508 case VSTATE_DESTROYING: 7509 ASSERT_VOP_ELOCKED(vp, __func__); 7510 switch (state) { 7511 case VSTATE_DEAD: 7512 return; 7513 default: 7514 break; 7515 } 7516 break; 7517 case VSTATE_DEAD: 7518 switch (state) { 7519 case VSTATE_UNINITIALIZED: 7520 return; 7521 default: 7522 break; 7523 } 7524 break; 7525 } 7526 7527 vn_printf(vp, "invalid state transition %d -> %d\n", vp->v_state, state); 7528 panic("invalid state transition %d -> %d\n", vp->v_state, state); 7529 } 7530 #endif 7531
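
/*
 * For reference, vn_set_state_validate() above accepts only the following
 * transitions; anything else panics with the offending state pair printed:
 *
 *	VSTATE_UNINITIALIZED -> VSTATE_CONSTRUCTED or VSTATE_DESTROYING
 *	VSTATE_CONSTRUCTED   -> VSTATE_DESTROYING  (exclusive lock asserted)
 *	VSTATE_DESTROYING    -> VSTATE_DEAD        (exclusive lock asserted)
 *	VSTATE_DEAD          -> VSTATE_UNINITIALIZED
 */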