/*-
 * Copyright (c) 2004 Poul-Henning Kamp
 * Copyright (c) 1994,1997 John S. Dyson
 * Copyright (c) 2013 The FreeBSD Foundation
 * All rights reserved.
 *
 * Portions of this software were developed by Konstantin Belousov
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * This file contains a buffer I/O scheme that implements a coherent
 * VM object and buffer cache.  Pains have been taken to make sure that
 * the performance degradation usually associated with such schemes is
 * not realized.
 *
 * Author:  John S. Dyson
 * Significant help during the development and debugging phases
 * was provided by David Greenman, also of the FreeBSD core team.
 *
 * See buf(9) for more information.
43 */ 44 45 #include <sys/cdefs.h> 46 __FBSDID("$FreeBSD$"); 47 48 #include <sys/param.h> 49 #include <sys/systm.h> 50 #include <sys/bio.h> 51 #include <sys/conf.h> 52 #include <sys/buf.h> 53 #include <sys/devicestat.h> 54 #include <sys/eventhandler.h> 55 #include <sys/fail.h> 56 #include <sys/limits.h> 57 #include <sys/lock.h> 58 #include <sys/malloc.h> 59 #include <sys/mount.h> 60 #include <sys/mutex.h> 61 #include <sys/kernel.h> 62 #include <sys/kthread.h> 63 #include <sys/proc.h> 64 #include <sys/racct.h> 65 #include <sys/resourcevar.h> 66 #include <sys/rwlock.h> 67 #include <sys/smp.h> 68 #include <sys/sysctl.h> 69 #include <sys/sysproto.h> 70 #include <sys/vmem.h> 71 #include <sys/vmmeter.h> 72 #include <sys/vnode.h> 73 #include <sys/watchdog.h> 74 #include <geom/geom.h> 75 #include <vm/vm.h> 76 #include <vm/vm_param.h> 77 #include <vm/vm_kern.h> 78 #include <vm/vm_object.h> 79 #include <vm/vm_page.h> 80 #include <vm/vm_pageout.h> 81 #include <vm/vm_pager.h> 82 #include <vm/vm_extern.h> 83 #include <vm/vm_map.h> 84 #include <vm/swap_pager.h> 85 #include "opt_compat.h" 86 #include "opt_swap.h" 87 88 static MALLOC_DEFINE(M_BIOBUF, "biobuf", "BIO buffer"); 89 90 struct bio_ops bioops; /* I/O operation notification */ 91 92 struct buf_ops buf_ops_bio = { 93 .bop_name = "buf_ops_bio", 94 .bop_write = bufwrite, 95 .bop_strategy = bufstrategy, 96 .bop_sync = bufsync, 97 .bop_bdflush = bufbdflush, 98 }; 99 100 static struct buf *buf; /* buffer header pool */ 101 extern struct buf *swbuf; /* Swap buffer header pool. */ 102 caddr_t unmapped_buf; 103 104 /* Used below and for softdep flushing threads in ufs/ffs/ffs_softdep.c */ 105 struct proc *bufdaemonproc; 106 struct proc *bufspacedaemonproc; 107 108 static int inmem(struct vnode *vp, daddr_t blkno); 109 static void vm_hold_free_pages(struct buf *bp, int newbsize); 110 static void vm_hold_load_pages(struct buf *bp, vm_offset_t from, 111 vm_offset_t to); 112 static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m); 113 static void vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off, 114 vm_page_t m); 115 static void vfs_clean_pages_dirty_buf(struct buf *bp); 116 static void vfs_setdirty_locked_object(struct buf *bp); 117 static void vfs_vmio_invalidate(struct buf *bp); 118 static void vfs_vmio_truncate(struct buf *bp, int npages); 119 static void vfs_vmio_extend(struct buf *bp, int npages, int size); 120 static int vfs_bio_clcheck(struct vnode *vp, int size, 121 daddr_t lblkno, daddr_t blkno); 122 static int buf_flush(struct vnode *vp, int); 123 static int buf_recycle(bool); 124 static int buf_scan(bool); 125 static int flushbufqueues(struct vnode *, int, int); 126 static void buf_daemon(void); 127 static void bremfreel(struct buf *bp); 128 static __inline void bd_wakeup(void); 129 static int sysctl_runningspace(SYSCTL_HANDLER_ARGS); 130 static void bufkva_reclaim(vmem_t *, int); 131 static void bufkva_free(struct buf *); 132 static int buf_import(void *, void **, int, int); 133 static void buf_release(void *, void **, int); 134 static void maxbcachebuf_adjust(void); 135 136 #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ 137 defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) 138 static int sysctl_bufspace(SYSCTL_HANDLER_ARGS); 139 #endif 140 141 int vmiodirenable = TRUE; 142 SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW, &vmiodirenable, 0, 143 "Use the VM system for directory writes"); 144 long runningbufspace; 145 SYSCTL_LONG(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0, 
146 "Amount of presently outstanding async buffer io"); 147 static long bufspace; 148 #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ 149 defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) 150 SYSCTL_PROC(_vfs, OID_AUTO, bufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RD, 151 &bufspace, 0, sysctl_bufspace, "L", "Virtual memory used for buffers"); 152 #else 153 SYSCTL_LONG(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0, 154 "Physical memory used for buffers"); 155 #endif 156 static long bufkvaspace; 157 SYSCTL_LONG(_vfs, OID_AUTO, bufkvaspace, CTLFLAG_RD, &bufkvaspace, 0, 158 "Kernel virtual memory used for buffers"); 159 static long maxbufspace; 160 SYSCTL_LONG(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RW, &maxbufspace, 0, 161 "Maximum allowed value of bufspace (including metadata)"); 162 static long bufmallocspace; 163 SYSCTL_LONG(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0, 164 "Amount of malloced memory for buffers"); 165 static long maxbufmallocspace; 166 SYSCTL_LONG(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace, 167 0, "Maximum amount of malloced memory for buffers"); 168 static long lobufspace; 169 SYSCTL_LONG(_vfs, OID_AUTO, lobufspace, CTLFLAG_RW, &lobufspace, 0, 170 "Minimum amount of buffers we want to have"); 171 long hibufspace; 172 SYSCTL_LONG(_vfs, OID_AUTO, hibufspace, CTLFLAG_RW, &hibufspace, 0, 173 "Maximum allowed value of bufspace (excluding metadata)"); 174 long bufspacethresh; 175 SYSCTL_LONG(_vfs, OID_AUTO, bufspacethresh, CTLFLAG_RW, &bufspacethresh, 176 0, "Bufspace consumed before waking the daemon to free some"); 177 static int buffreekvacnt; 178 SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, 0, 179 "Number of times we have freed the KVA space from some buffer"); 180 static int bufdefragcnt; 181 SYSCTL_INT(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt, 0, 182 "Number of times we have had to repeat buffer allocation to defragment"); 183 static long lorunningspace; 184 SYSCTL_PROC(_vfs, OID_AUTO, lorunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE | 185 CTLFLAG_RW, &lorunningspace, 0, sysctl_runningspace, "L", 186 "Minimum preferred space used for in-progress I/O"); 187 static long hirunningspace; 188 SYSCTL_PROC(_vfs, OID_AUTO, hirunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE | 189 CTLFLAG_RW, &hirunningspace, 0, sysctl_runningspace, "L", 190 "Maximum amount of space to use for in-progress I/O"); 191 int dirtybufferflushes; 192 SYSCTL_INT(_vfs, OID_AUTO, dirtybufferflushes, CTLFLAG_RW, &dirtybufferflushes, 193 0, "Number of bdwrite to bawrite conversions to limit dirty buffers"); 194 int bdwriteskip; 195 SYSCTL_INT(_vfs, OID_AUTO, bdwriteskip, CTLFLAG_RW, &bdwriteskip, 196 0, "Number of buffers supplied to bdwrite with snapshot deadlock risk"); 197 int altbufferflushes; 198 SYSCTL_INT(_vfs, OID_AUTO, altbufferflushes, CTLFLAG_RW, &altbufferflushes, 199 0, "Number of fsync flushes to limit dirty buffers"); 200 static int recursiveflushes; 201 SYSCTL_INT(_vfs, OID_AUTO, recursiveflushes, CTLFLAG_RW, &recursiveflushes, 202 0, "Number of flushes skipped due to being recursive"); 203 static int numdirtybuffers; 204 SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD, &numdirtybuffers, 0, 205 "Number of buffers that are dirty (has unwritten changes) at the moment"); 206 static int lodirtybuffers; 207 SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW, &lodirtybuffers, 0, 208 "How many buffers we want to have free before bufdaemon can sleep"); 209 static int hidirtybuffers; 210 
SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW, &hidirtybuffers, 0,
    "When the number of dirty buffers is considered severe");
int dirtybufthresh;
SYSCTL_INT(_vfs, OID_AUTO, dirtybufthresh, CTLFLAG_RW, &dirtybufthresh,
    0, "Number of bdwrite to bawrite conversions to clear dirty buffers");
static int numfreebuffers;
SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, &numfreebuffers, 0,
    "Number of free buffers");
static int lofreebuffers;
SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW, &lofreebuffers, 0,
    "Target number of free buffers");
static int hifreebuffers;
SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, &hifreebuffers, 0,
    "Threshold for clean buffer recycling");
static int getnewbufcalls;
SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, &getnewbufcalls, 0,
    "Number of calls to getnewbuf");
static int getnewbufrestarts;
SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW, &getnewbufrestarts, 0,
    "Number of times getnewbuf has had to restart a buffer acquisition");
static int mappingrestarts;
SYSCTL_INT(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RW, &mappingrestarts, 0,
    "Number of times getblk has had to restart a buffer mapping for "
    "unmapped buffer");
static int numbufallocfails;
SYSCTL_INT(_vfs, OID_AUTO, numbufallocfails, CTLFLAG_RW, &numbufallocfails, 0,
    "Number of times buffer allocations failed");
static int flushbufqtarget = 100;
SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0,
    "Amount of work to do in flushbufqueues when helping bufdaemon");
static long notbufdflushes;
SYSCTL_LONG(_vfs, OID_AUTO, notbufdflushes, CTLFLAG_RD, &notbufdflushes, 0,
    "Number of dirty buffer flushes done by the bufdaemon helpers");
static long barrierwrites;
SYSCTL_LONG(_vfs, OID_AUTO, barrierwrites, CTLFLAG_RW, &barrierwrites, 0,
    "Number of barrier writes");
SYSCTL_INT(_vfs, OID_AUTO, unmapped_buf_allowed, CTLFLAG_RD,
    &unmapped_buf_allowed, 0,
    "Permit the use of unmapped I/O");
int maxbcachebuf = MAXBCACHEBUF;
SYSCTL_INT(_vfs, OID_AUTO, maxbcachebuf, CTLFLAG_RDTUN, &maxbcachebuf, 0,
    "Maximum size of a buffer cache block");

/*
 * This lock synchronizes access to bd_request.
 */
static struct mtx_padalign bdlock;

/*
 * This lock protects the runningbufreq and synchronizes runningbufwakeup and
 * waitrunningbufspace().
 */
static struct mtx_padalign rbreqlock;

/*
 * Lock that protects needsbuffer and the sleeps/wakeups surrounding it.
 */
static struct rwlock_padalign nblock;

/*
 * Lock that protects bdirtywait.
 */
static struct mtx_padalign bdirtylock;

/*
 * Wakeup point for bufdaemon, as well as indicator of whether it is already
 * active.  Set to 1 when the bufdaemon is already "on" the queue, 0 when it
 * is idling.
 */
static int bd_request;

/*
 * Request/wakeup point for the bufspace daemon.
 */
static int bufspace_request;

/*
 * Request for the buf daemon to write more buffers than is indicated by
 * lodirtybuffers.  This may be necessary to push out excess dependencies or
 * defragment the address space where a simple count of the number of dirty
 * buffers is insufficient to characterize the demand for flushing them.
 */
static int bd_speedupreq;

/*
 * Synchronization (sleep/wakeup) variable for active buffer space requests.
296 * Set when wait starts, cleared prior to wakeup(). 297 * Used in runningbufwakeup() and waitrunningbufspace(). 298 */ 299 static int runningbufreq; 300 301 /* 302 * Synchronization (sleep/wakeup) variable for buffer requests. 303 * Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done 304 * by and/or. 305 * Used in numdirtywakeup(), bufspace_wakeup(), bwillwrite(), 306 * getnewbuf(), and getblk(). 307 */ 308 static volatile int needsbuffer; 309 310 /* 311 * Synchronization for bwillwrite() waiters. 312 */ 313 static int bdirtywait; 314 315 /* 316 * Definitions for the buffer free lists. 317 */ 318 #define QUEUE_NONE 0 /* on no queue */ 319 #define QUEUE_EMPTY 1 /* empty buffer headers */ 320 #define QUEUE_DIRTY 2 /* B_DELWRI buffers */ 321 #define QUEUE_CLEAN 3 /* non-B_DELWRI buffers */ 322 #define QUEUE_SENTINEL 1024 /* not an queue index, but mark for sentinel */ 323 324 /* Maximum number of clean buffer queues. */ 325 #define CLEAN_QUEUES 16 326 327 /* Configured number of clean queues. */ 328 static int clean_queues; 329 330 /* Maximum number of buffer queues. */ 331 #define BUFFER_QUEUES (QUEUE_CLEAN + CLEAN_QUEUES) 332 333 /* Queues for free buffers with various properties */ 334 static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } }; 335 #ifdef INVARIANTS 336 static int bq_len[BUFFER_QUEUES]; 337 #endif 338 339 /* 340 * Lock for each bufqueue 341 */ 342 static struct mtx_padalign bqlocks[BUFFER_QUEUES]; 343 344 /* 345 * per-cpu empty buffer cache. 346 */ 347 uma_zone_t buf_zone; 348 349 /* 350 * Single global constant for BUF_WMESG, to avoid getting multiple references. 351 * buf_wmesg is referred from macros. 352 */ 353 const char *buf_wmesg = BUF_WMESG; 354 355 static int 356 sysctl_runningspace(SYSCTL_HANDLER_ARGS) 357 { 358 long value; 359 int error; 360 361 value = *(long *)arg1; 362 error = sysctl_handle_long(oidp, &value, 0, req); 363 if (error != 0 || req->newptr == NULL) 364 return (error); 365 mtx_lock(&rbreqlock); 366 if (arg1 == &hirunningspace) { 367 if (value < lorunningspace) 368 error = EINVAL; 369 else 370 hirunningspace = value; 371 } else { 372 KASSERT(arg1 == &lorunningspace, 373 ("%s: unknown arg1", __func__)); 374 if (value > hirunningspace) 375 error = EINVAL; 376 else 377 lorunningspace = value; 378 } 379 mtx_unlock(&rbreqlock); 380 return (error); 381 } 382 383 #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ 384 defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) 385 static int 386 sysctl_bufspace(SYSCTL_HANDLER_ARGS) 387 { 388 long lvalue; 389 int ivalue; 390 391 if (sizeof(int) == sizeof(long) || req->oldlen >= sizeof(long)) 392 return (sysctl_handle_long(oidp, arg1, arg2, req)); 393 lvalue = *(long *)arg1; 394 if (lvalue > INT_MAX) 395 /* On overflow, still write out a long to trigger ENOMEM. */ 396 return (sysctl_handle_long(oidp, &lvalue, 0, req)); 397 ivalue = lvalue; 398 return (sysctl_handle_int(oidp, &ivalue, 0, req)); 399 } 400 #endif 401 402 static int 403 bqcleanq(void) 404 { 405 static int nextq; 406 407 return ((atomic_fetchadd_int(&nextq, 1) % clean_queues) + QUEUE_CLEAN); 408 } 409 410 static int 411 bqisclean(int qindex) 412 { 413 414 return (qindex >= QUEUE_CLEAN && qindex < QUEUE_CLEAN + CLEAN_QUEUES); 415 } 416 417 /* 418 * bqlock: 419 * 420 * Return the appropriate queue lock based on the index. 
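 *
 *	Each queue index has its own pad-aligned mutex in bqlocks[]; the
 *	clean queues occupy indices QUEUE_CLEAN through
 *	QUEUE_CLEAN + CLEAN_QUEUES - 1, and bqcleanq() above spreads
 *	allocations across however many of them are configured.  A minimal
 *	userland sketch of that round-robin selection, assuming the
 *	illustrative values QUEUE_CLEAN == 3 and clean_queues == 16 (the
 *	sketch is explanatory only and not part of the kernel code):
 *
 *		#include <stdatomic.h>
 *		#include <stdio.h>
 *
 *		#define QUEUE_CLEAN	3
 *		static int clean_queues = 16;	// configured, <= CLEAN_QUEUES
 *		static _Atomic int nextq;
 *
 *		static int
 *		bqcleanq_sketch(void)
 *		{
 *			// Each caller takes a ticket; the modulo spreads
 *			// consecutive callers over the clean queues.
 *			return ((atomic_fetch_add(&nextq, 1) % clean_queues) +
 *			    QUEUE_CLEAN);
 *		}
 *
 *		int
 *		main(void)
 *		{
 *			for (int i = 0; i < 20; i++)
 *				printf("%d ", bqcleanq_sketch());
 *			printf("\n");	// prints 3..18, then wraps back to 3
 *			return (0);
 *		}
 *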
421 */ 422 static inline struct mtx * 423 bqlock(int qindex) 424 { 425 426 return (struct mtx *)&bqlocks[qindex]; 427 } 428 429 /* 430 * bdirtywakeup: 431 * 432 * Wakeup any bwillwrite() waiters. 433 */ 434 static void 435 bdirtywakeup(void) 436 { 437 mtx_lock(&bdirtylock); 438 if (bdirtywait) { 439 bdirtywait = 0; 440 wakeup(&bdirtywait); 441 } 442 mtx_unlock(&bdirtylock); 443 } 444 445 /* 446 * bdirtysub: 447 * 448 * Decrement the numdirtybuffers count by one and wakeup any 449 * threads blocked in bwillwrite(). 450 */ 451 static void 452 bdirtysub(void) 453 { 454 455 if (atomic_fetchadd_int(&numdirtybuffers, -1) == 456 (lodirtybuffers + hidirtybuffers) / 2) 457 bdirtywakeup(); 458 } 459 460 /* 461 * bdirtyadd: 462 * 463 * Increment the numdirtybuffers count by one and wakeup the buf 464 * daemon if needed. 465 */ 466 static void 467 bdirtyadd(void) 468 { 469 470 /* 471 * Only do the wakeup once as we cross the boundary. The 472 * buf daemon will keep running until the condition clears. 473 */ 474 if (atomic_fetchadd_int(&numdirtybuffers, 1) == 475 (lodirtybuffers + hidirtybuffers) / 2) 476 bd_wakeup(); 477 } 478 479 /* 480 * bufspace_wakeup: 481 * 482 * Called when buffer space is potentially available for recovery. 483 * getnewbuf() will block on this flag when it is unable to free 484 * sufficient buffer space. Buffer space becomes recoverable when 485 * bp's get placed back in the queues. 486 */ 487 static void 488 bufspace_wakeup(void) 489 { 490 491 /* 492 * If someone is waiting for bufspace, wake them up. 493 * 494 * Since needsbuffer is set prior to doing an additional queue 495 * scan it is safe to check for the flag prior to acquiring the 496 * lock. The thread that is preparing to scan again before 497 * blocking would discover the buf we released. 498 */ 499 if (needsbuffer) { 500 rw_rlock(&nblock); 501 if (atomic_cmpset_int(&needsbuffer, 1, 0) == 1) 502 wakeup(__DEVOLATILE(void *, &needsbuffer)); 503 rw_runlock(&nblock); 504 } 505 } 506 507 /* 508 * bufspace_daemonwakeup: 509 * 510 * Wakeup the daemon responsible for freeing clean bufs. 511 */ 512 static void 513 bufspace_daemonwakeup(void) 514 { 515 rw_rlock(&nblock); 516 if (bufspace_request == 0) { 517 bufspace_request = 1; 518 wakeup(&bufspace_request); 519 } 520 rw_runlock(&nblock); 521 } 522 523 /* 524 * bufspace_adjust: 525 * 526 * Adjust the reported bufspace for a KVA managed buffer, possibly 527 * waking any waiters. 528 */ 529 static void 530 bufspace_adjust(struct buf *bp, int bufsize) 531 { 532 long space; 533 int diff; 534 535 KASSERT((bp->b_flags & B_MALLOC) == 0, 536 ("bufspace_adjust: malloc buf %p", bp)); 537 diff = bufsize - bp->b_bufsize; 538 if (diff < 0) { 539 atomic_subtract_long(&bufspace, -diff); 540 bufspace_wakeup(); 541 } else { 542 space = atomic_fetchadd_long(&bufspace, diff); 543 /* Wake up the daemon on the transition. */ 544 if (space < bufspacethresh && space + diff >= bufspacethresh) 545 bufspace_daemonwakeup(); 546 } 547 bp->b_bufsize = bufsize; 548 } 549 550 /* 551 * bufspace_reserve: 552 * 553 * Reserve bufspace before calling allocbuf(). metadata has a 554 * different space limit than data. 555 */ 556 static int 557 bufspace_reserve(int size, bool metadata) 558 { 559 long limit; 560 long space; 561 562 if (metadata) 563 limit = maxbufspace; 564 else 565 limit = hibufspace; 566 do { 567 space = bufspace; 568 if (space + size > limit) 569 return (ENOSPC); 570 } while (atomic_cmpset_long(&bufspace, space, space + size) == 0); 571 572 /* Wake up the daemon on the transition. 
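 * A minimal userland sketch of this reserve-then-wake-on-transition
 * pattern, using C11 atomics in place of the kernel's atomic_cmpset_long
 * (the names and limits below are hypothetical, for illustration only):
 *
 *	#include <stdatomic.h>
 *	#include <stdio.h>
 *
 *	static _Atomic long space;		// plays the role of bufspace
 *	static long limit = 1000;		// plays the role of hibufspace
 *	static long thresh = 800;		// plays the role of bufspacethresh
 *
 *	static void
 *	daemon_wakeup(void)
 *	{
 *		printf("wake the space daemon\n");
 *	}
 *
 *	static int
 *	reserve(long size)
 *	{
 *		long old;
 *
 *		do {
 *			old = atomic_load(&space);
 *			if (old + size > limit)
 *				return (-1);	// out of space
 *		} while (!atomic_compare_exchange_weak(&space, &old, old + size));
 *		// Only the caller whose reservation crosses the threshold
 *		// issues the wakeup, so the daemon is not woken repeatedly.
 *		if (old < thresh && old + size >= thresh)
 *			daemon_wakeup();
 *		return (0);
 *	}
 *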
*/ 573 if (space < bufspacethresh && space + size >= bufspacethresh) 574 bufspace_daemonwakeup(); 575 576 return (0); 577 } 578 579 /* 580 * bufspace_release: 581 * 582 * Release reserved bufspace after bufspace_adjust() has consumed it. 583 */ 584 static void 585 bufspace_release(int size) 586 { 587 atomic_subtract_long(&bufspace, size); 588 bufspace_wakeup(); 589 } 590 591 /* 592 * bufspace_wait: 593 * 594 * Wait for bufspace, acting as the buf daemon if a locked vnode is 595 * supplied. needsbuffer must be set in a safe fashion prior to 596 * polling for space. The operation must be re-tried on return. 597 */ 598 static void 599 bufspace_wait(struct vnode *vp, int gbflags, int slpflag, int slptimeo) 600 { 601 struct thread *td; 602 int error, fl, norunbuf; 603 604 if ((gbflags & GB_NOWAIT_BD) != 0) 605 return; 606 607 td = curthread; 608 rw_wlock(&nblock); 609 while (needsbuffer != 0) { 610 if (vp != NULL && vp->v_type != VCHR && 611 (td->td_pflags & TDP_BUFNEED) == 0) { 612 rw_wunlock(&nblock); 613 /* 614 * getblk() is called with a vnode locked, and 615 * some majority of the dirty buffers may as 616 * well belong to the vnode. Flushing the 617 * buffers there would make a progress that 618 * cannot be achieved by the buf_daemon, that 619 * cannot lock the vnode. 620 */ 621 norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) | 622 (td->td_pflags & TDP_NORUNNINGBUF); 623 624 /* 625 * Play bufdaemon. The getnewbuf() function 626 * may be called while the thread owns lock 627 * for another dirty buffer for the same 628 * vnode, which makes it impossible to use 629 * VOP_FSYNC() there, due to the buffer lock 630 * recursion. 631 */ 632 td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF; 633 fl = buf_flush(vp, flushbufqtarget); 634 td->td_pflags &= norunbuf; 635 rw_wlock(&nblock); 636 if (fl != 0) 637 continue; 638 if (needsbuffer == 0) 639 break; 640 } 641 error = rw_sleep(__DEVOLATILE(void *, &needsbuffer), &nblock, 642 (PRIBIO + 4) | slpflag, "newbuf", slptimeo); 643 if (error != 0) 644 break; 645 } 646 rw_wunlock(&nblock); 647 } 648 649 650 /* 651 * bufspace_daemon: 652 * 653 * buffer space management daemon. Tries to maintain some marginal 654 * amount of free buffer space so that requesting processes neither 655 * block nor work to reclaim buffers. 656 */ 657 static void 658 bufspace_daemon(void) 659 { 660 for (;;) { 661 kproc_suspend_check(bufspacedaemonproc); 662 663 /* 664 * Free buffers from the clean queue until we meet our 665 * targets. 666 * 667 * Theory of operation: The buffer cache is most efficient 668 * when some free buffer headers and space are always 669 * available to getnewbuf(). This daemon attempts to prevent 670 * the excessive blocking and synchronization associated 671 * with shortfall. It goes through three phases according 672 * demand: 673 * 674 * 1) The daemon wakes up voluntarily once per-second 675 * during idle periods when the counters are below 676 * the wakeup thresholds (bufspacethresh, lofreebuffers). 677 * 678 * 2) The daemon wakes up as we cross the thresholds 679 * ahead of any potential blocking. This may bounce 680 * slightly according to the rate of consumption and 681 * release. 682 * 683 * 3) The daemon and consumers are starved for working 684 * clean buffers. This is the 'bufspace' sleep below 685 * which will inefficiently trade bufs with bqrelse 686 * until we return to condition 2. 
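 *
 * A minimal userland sketch of that three-phase loop, with a POSIX
 * condition variable standing in for the kernel's nblock/rw_sleep
 * machinery (all names and numbers below are hypothetical):
 *
 *	#include <errno.h>
 *	#include <pthread.h>
 *	#include <stdbool.h>
 *	#include <time.h>
 *
 *	static pthread_mutex_t lk = PTHREAD_MUTEX_INITIALIZER;
 *	static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
 *	static bool request;			// cf. bufspace_request
 *	static long space = 500, lospace = 100;	// cf. bufspace, lobufspace
 *
 *	static void reclaim_some(void) { space -= 10; }	// cf. buf_recycle()
 *
 *	static void *
 *	space_daemon(void *arg)
 *	{
 *		struct timespec ts;
 *
 *		for (;;) {
 *			// Phase 3: work until the targets are met again.
 *			while (space > lospace)
 *				reclaim_some();
 *			pthread_mutex_lock(&lk);
 *			request = false;
 *			clock_gettime(CLOCK_REALTIME, &ts);
 *			ts.tv_sec += 1;		// Phase 1: poll once per second.
 *			// Phase 2: or wake early when a consumer asks for it.
 *			while (!request)
 *				if (pthread_cond_timedwait(&cv, &lk, &ts) ==
 *				    ETIMEDOUT)
 *					break;
 *			pthread_mutex_unlock(&lk);
 *		}
 *		return (arg);
 *	}
 *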
687 */ 688 while (bufspace > lobufspace || 689 numfreebuffers < hifreebuffers) { 690 if (buf_recycle(false) != 0) { 691 atomic_set_int(&needsbuffer, 1); 692 if (buf_recycle(false) != 0) { 693 rw_wlock(&nblock); 694 if (needsbuffer) 695 rw_sleep(__DEVOLATILE(void *, 696 &needsbuffer), &nblock, 697 PRIBIO|PDROP, "bufspace", 698 hz/10); 699 else 700 rw_wunlock(&nblock); 701 } 702 } 703 maybe_yield(); 704 } 705 706 /* 707 * Re-check our limits under the exclusive nblock. 708 */ 709 rw_wlock(&nblock); 710 if (bufspace < bufspacethresh && 711 numfreebuffers > lofreebuffers) { 712 bufspace_request = 0; 713 rw_sleep(&bufspace_request, &nblock, PRIBIO|PDROP, 714 "-", hz); 715 } else 716 rw_wunlock(&nblock); 717 } 718 } 719 720 static struct kproc_desc bufspace_kp = { 721 "bufspacedaemon", 722 bufspace_daemon, 723 &bufspacedaemonproc 724 }; 725 SYSINIT(bufspacedaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, 726 &bufspace_kp); 727 728 /* 729 * bufmallocadjust: 730 * 731 * Adjust the reported bufspace for a malloc managed buffer, possibly 732 * waking any waiters. 733 */ 734 static void 735 bufmallocadjust(struct buf *bp, int bufsize) 736 { 737 int diff; 738 739 KASSERT((bp->b_flags & B_MALLOC) != 0, 740 ("bufmallocadjust: non-malloc buf %p", bp)); 741 diff = bufsize - bp->b_bufsize; 742 if (diff < 0) 743 atomic_subtract_long(&bufmallocspace, -diff); 744 else 745 atomic_add_long(&bufmallocspace, diff); 746 bp->b_bufsize = bufsize; 747 } 748 749 /* 750 * runningwakeup: 751 * 752 * Wake up processes that are waiting on asynchronous writes to fall 753 * below lorunningspace. 754 */ 755 static void 756 runningwakeup(void) 757 { 758 759 mtx_lock(&rbreqlock); 760 if (runningbufreq) { 761 runningbufreq = 0; 762 wakeup(&runningbufreq); 763 } 764 mtx_unlock(&rbreqlock); 765 } 766 767 /* 768 * runningbufwakeup: 769 * 770 * Decrement the outstanding write count according. 771 */ 772 void 773 runningbufwakeup(struct buf *bp) 774 { 775 long space, bspace; 776 777 bspace = bp->b_runningbufspace; 778 if (bspace == 0) 779 return; 780 space = atomic_fetchadd_long(&runningbufspace, -bspace); 781 KASSERT(space >= bspace, ("runningbufspace underflow %ld %ld", 782 space, bspace)); 783 bp->b_runningbufspace = 0; 784 /* 785 * Only acquire the lock and wakeup on the transition from exceeding 786 * the threshold to falling below it. 787 */ 788 if (space < lorunningspace) 789 return; 790 if (space - bspace > lorunningspace) 791 return; 792 runningwakeup(); 793 } 794 795 /* 796 * waitrunningbufspace() 797 * 798 * runningbufspace is a measure of the amount of I/O currently 799 * running. This routine is used in async-write situations to 800 * prevent creating huge backups of pending writes to a device. 801 * Only asynchronous writes are governed by this function. 802 * 803 * This does NOT turn an async write into a sync write. It waits 804 * for earlier writes to complete and generally returns before the 805 * caller's write has reached the device. 806 */ 807 void 808 waitrunningbufspace(void) 809 { 810 811 mtx_lock(&rbreqlock); 812 while (runningbufspace > hirunningspace) { 813 runningbufreq = 1; 814 msleep(&runningbufreq, &rbreqlock, PVM, "wdrain", 0); 815 } 816 mtx_unlock(&rbreqlock); 817 } 818 819 820 /* 821 * vfs_buf_test_cache: 822 * 823 * Called when a buffer is extended. This function clears the B_CACHE 824 * bit if the newly extended portion of the buffer does not contain 825 * valid data. 
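 *
 *	For example, with 4 KiB pages (PAGE_MASK == 0xfff), extending a
 *	buffer whose b_offset is 6144 by 'size' bytes at relative offset
 *	2048 checks page validity starting at
 *
 *		base = (6144 + 2048) & 0xfff = 0
 *
 *	and clears B_CACHE unless that page is valid for the whole range,
 *	so the caller will treat the extended region as needing a read.
 *	(The numbers are illustrative only.)
 *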
826 */ 827 static __inline void 828 vfs_buf_test_cache(struct buf *bp, vm_ooffset_t foff, vm_offset_t off, 829 vm_offset_t size, vm_page_t m) 830 { 831 832 VM_OBJECT_ASSERT_LOCKED(m->object); 833 if (bp->b_flags & B_CACHE) { 834 int base = (foff + off) & PAGE_MASK; 835 if (vm_page_is_valid(m, base, size) == 0) 836 bp->b_flags &= ~B_CACHE; 837 } 838 } 839 840 /* Wake up the buffer daemon if necessary */ 841 static __inline void 842 bd_wakeup(void) 843 { 844 845 mtx_lock(&bdlock); 846 if (bd_request == 0) { 847 bd_request = 1; 848 wakeup(&bd_request); 849 } 850 mtx_unlock(&bdlock); 851 } 852 853 /* 854 * Adjust the maxbcachbuf tunable. 855 */ 856 static void 857 maxbcachebuf_adjust(void) 858 { 859 int i; 860 861 /* 862 * maxbcachebuf must be a power of 2 >= MAXBSIZE. 863 */ 864 i = 2; 865 while (i * 2 <= maxbcachebuf) 866 i *= 2; 867 maxbcachebuf = i; 868 if (maxbcachebuf < MAXBSIZE) 869 maxbcachebuf = MAXBSIZE; 870 if (maxbcachebuf > MAXPHYS) 871 maxbcachebuf = MAXPHYS; 872 if (bootverbose != 0 && maxbcachebuf != MAXBCACHEBUF) 873 printf("maxbcachebuf=%d\n", maxbcachebuf); 874 } 875 876 /* 877 * bd_speedup - speedup the buffer cache flushing code 878 */ 879 void 880 bd_speedup(void) 881 { 882 int needwake; 883 884 mtx_lock(&bdlock); 885 needwake = 0; 886 if (bd_speedupreq == 0 || bd_request == 0) 887 needwake = 1; 888 bd_speedupreq = 1; 889 bd_request = 1; 890 if (needwake) 891 wakeup(&bd_request); 892 mtx_unlock(&bdlock); 893 } 894 895 #ifndef NSWBUF_MIN 896 #define NSWBUF_MIN 16 897 #endif 898 899 #ifdef __i386__ 900 #define TRANSIENT_DENOM 5 901 #else 902 #define TRANSIENT_DENOM 10 903 #endif 904 905 /* 906 * Calculating buffer cache scaling values and reserve space for buffer 907 * headers. This is called during low level kernel initialization and 908 * may be called more then once. We CANNOT write to the memory area 909 * being reserved at this time. 910 */ 911 caddr_t 912 kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est) 913 { 914 int tuned_nbuf; 915 long maxbuf, maxbuf_sz, buf_sz, biotmap_sz; 916 917 /* 918 * physmem_est is in pages. Convert it to kilobytes (assumes 919 * PAGE_SIZE is >= 1K) 920 */ 921 physmem_est = physmem_est * (PAGE_SIZE / 1024); 922 923 maxbcachebuf_adjust(); 924 /* 925 * The nominal buffer size (and minimum KVA allocation) is BKVASIZE. 926 * For the first 64MB of ram nominally allocate sufficient buffers to 927 * cover 1/4 of our ram. Beyond the first 64MB allocate additional 928 * buffers to cover 1/10 of our ram over 64MB. When auto-sizing 929 * the buffer cache we limit the eventual kva reservation to 930 * maxbcache bytes. 931 * 932 * factor represents the 1/4 x ram conversion. 933 */ 934 if (nbuf == 0) { 935 int factor = 4 * BKVASIZE / 1024; 936 937 nbuf = 50; 938 if (physmem_est > 4096) 939 nbuf += min((physmem_est - 4096) / factor, 940 65536 / factor); 941 if (physmem_est > 65536) 942 nbuf += min((physmem_est - 65536) * 2 / (factor * 5), 943 32 * 1024 * 1024 / (factor * 5)); 944 945 if (maxbcache && nbuf > maxbcache / BKVASIZE) 946 nbuf = maxbcache / BKVASIZE; 947 tuned_nbuf = 1; 948 } else 949 tuned_nbuf = 0; 950 951 /* XXX Avoid unsigned long overflows later on with maxbufspace. */ 952 maxbuf = (LONG_MAX / 3) / BKVASIZE; 953 if (nbuf > maxbuf) { 954 if (!tuned_nbuf) 955 printf("Warning: nbufs lowered from %d to %ld\n", nbuf, 956 maxbuf); 957 nbuf = maxbuf; 958 } 959 960 /* 961 * Ideal allocation size for the transient bio submap is 10% 962 * of the maximal space buffer map. 
This roughly corresponds 963 * to the amount of the buffer mapped for typical UFS load. 964 * 965 * Clip the buffer map to reserve space for the transient 966 * BIOs, if its extent is bigger than 90% (80% on i386) of the 967 * maximum buffer map extent on the platform. 968 * 969 * The fall-back to the maxbuf in case of maxbcache unset, 970 * allows to not trim the buffer KVA for the architectures 971 * with ample KVA space. 972 */ 973 if (bio_transient_maxcnt == 0 && unmapped_buf_allowed) { 974 maxbuf_sz = maxbcache != 0 ? maxbcache : maxbuf * BKVASIZE; 975 buf_sz = (long)nbuf * BKVASIZE; 976 if (buf_sz < maxbuf_sz / TRANSIENT_DENOM * 977 (TRANSIENT_DENOM - 1)) { 978 /* 979 * There is more KVA than memory. Do not 980 * adjust buffer map size, and assign the rest 981 * of maxbuf to transient map. 982 */ 983 biotmap_sz = maxbuf_sz - buf_sz; 984 } else { 985 /* 986 * Buffer map spans all KVA we could afford on 987 * this platform. Give 10% (20% on i386) of 988 * the buffer map to the transient bio map. 989 */ 990 biotmap_sz = buf_sz / TRANSIENT_DENOM; 991 buf_sz -= biotmap_sz; 992 } 993 if (biotmap_sz / INT_MAX > MAXPHYS) 994 bio_transient_maxcnt = INT_MAX; 995 else 996 bio_transient_maxcnt = biotmap_sz / MAXPHYS; 997 /* 998 * Artificially limit to 1024 simultaneous in-flight I/Os 999 * using the transient mapping. 1000 */ 1001 if (bio_transient_maxcnt > 1024) 1002 bio_transient_maxcnt = 1024; 1003 if (tuned_nbuf) 1004 nbuf = buf_sz / BKVASIZE; 1005 } 1006 1007 /* 1008 * swbufs are used as temporary holders for I/O, such as paging I/O. 1009 * We have no less then 16 and no more then 256. 1010 */ 1011 nswbuf = min(nbuf / 4, 256); 1012 TUNABLE_INT_FETCH("kern.nswbuf", &nswbuf); 1013 if (nswbuf < NSWBUF_MIN) 1014 nswbuf = NSWBUF_MIN; 1015 1016 /* 1017 * Reserve space for the buffer cache buffers 1018 */ 1019 swbuf = (void *)v; 1020 v = (caddr_t)(swbuf + nswbuf); 1021 buf = (void *)v; 1022 v = (caddr_t)(buf + nbuf); 1023 1024 return(v); 1025 } 1026 1027 /* Initialize the buffer subsystem. Called before use of any buffers. 
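 *
 * As a concrete illustration of the space watermarks bufinit() computes
 * below, this userland sketch evaluates the same formulas with assumed,
 * hypothetical inputs (BKVASIZE 16 KiB, maxbcachebuf 64 KiB, nbuf 65536);
 * the real values are tuned at boot and differ per machine:
 *
 *	#include <stdio.h>
 *
 *	#define BKVASIZE	16384LL		// assumed nominal buffer KVA size
 *	#define MAXBCACHEBUF	65536LL		// assumed max cache block size
 *
 *	static long long llmax(long long a, long long b) { return (a > b ? a : b); }
 *	static long long llmin(long long a, long long b) { return (a < b ? a : b); }
 *	static long long llroundup(long long x, long long y)
 *	{
 *		return (((x + y - 1) / y) * y);
 *	}
 *
 *	int
 *	main(void)
 *	{
 *		long long nbuf = 65536;		// hypothetical autotuned count
 *		long long maxbufspace = nbuf * BKVASIZE;
 *		long long hibufspace = llmax(3 * maxbufspace / 4,
 *		    maxbufspace - MAXBCACHEBUF * 10);
 *		long long lobufspace = (hibufspace / 20) * 19;
 *		long long bufspacethresh = lobufspace +
 *		    (hibufspace - lobufspace) / 2;
 *		long long hirunningspace = llmax(llmin(llroundup(hibufspace / 64,
 *		    MAXBCACHEBUF), 16 * 1024 * 1024), 1024 * 1024);
 *		long long lorunningspace = llroundup((hirunningspace * 2) / 3,
 *		    MAXBCACHEBUF);
 *
 *		printf("maxbufspace %lld hibufspace %lld lobufspace %lld\n",
 *		    maxbufspace, hibufspace, lobufspace);
 *		printf("bufspacethresh %lld hirunningspace %lld "
 *		    "lorunningspace %lld\n",
 *		    bufspacethresh, hirunningspace, lorunningspace);
 *		return (0);
 *	}
 *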
*/ 1028 void 1029 bufinit(void) 1030 { 1031 struct buf *bp; 1032 int i; 1033 1034 KASSERT(maxbcachebuf >= MAXBSIZE, 1035 ("maxbcachebuf (%d) must be >= MAXBSIZE (%d)\n", maxbcachebuf, 1036 MAXBSIZE)); 1037 mtx_init(&bqlocks[QUEUE_DIRTY], "bufq dirty lock", NULL, MTX_DEF); 1038 mtx_init(&bqlocks[QUEUE_EMPTY], "bufq empty lock", NULL, MTX_DEF); 1039 for (i = QUEUE_CLEAN; i < QUEUE_CLEAN + CLEAN_QUEUES; i++) 1040 mtx_init(&bqlocks[i], "bufq clean lock", NULL, MTX_DEF); 1041 mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF); 1042 rw_init(&nblock, "needsbuffer lock"); 1043 mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF); 1044 mtx_init(&bdirtylock, "dirty buf lock", NULL, MTX_DEF); 1045 1046 /* next, make a null set of free lists */ 1047 for (i = 0; i < BUFFER_QUEUES; i++) 1048 TAILQ_INIT(&bufqueues[i]); 1049 1050 unmapped_buf = (caddr_t)kva_alloc(MAXPHYS); 1051 1052 /* finally, initialize each buffer header and stick on empty q */ 1053 for (i = 0; i < nbuf; i++) { 1054 bp = &buf[i]; 1055 bzero(bp, sizeof *bp); 1056 bp->b_flags = B_INVAL; 1057 bp->b_rcred = NOCRED; 1058 bp->b_wcred = NOCRED; 1059 bp->b_qindex = QUEUE_EMPTY; 1060 bp->b_xflags = 0; 1061 bp->b_data = bp->b_kvabase = unmapped_buf; 1062 LIST_INIT(&bp->b_dep); 1063 BUF_LOCKINIT(bp); 1064 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist); 1065 #ifdef INVARIANTS 1066 bq_len[QUEUE_EMPTY]++; 1067 #endif 1068 } 1069 1070 /* 1071 * maxbufspace is the absolute maximum amount of buffer space we are 1072 * allowed to reserve in KVM and in real terms. The absolute maximum 1073 * is nominally used by metadata. hibufspace is the nominal maximum 1074 * used by most other requests. The differential is required to 1075 * ensure that metadata deadlocks don't occur. 1076 * 1077 * maxbufspace is based on BKVASIZE. Allocating buffers larger then 1078 * this may result in KVM fragmentation which is not handled optimally 1079 * by the system. XXX This is less true with vmem. We could use 1080 * PAGE_SIZE. 1081 */ 1082 maxbufspace = (long)nbuf * BKVASIZE; 1083 hibufspace = lmax(3 * maxbufspace / 4, maxbufspace - maxbcachebuf * 10); 1084 lobufspace = (hibufspace / 20) * 19; /* 95% */ 1085 bufspacethresh = lobufspace + (hibufspace - lobufspace) / 2; 1086 1087 /* 1088 * Note: The 16 MiB upper limit for hirunningspace was chosen 1089 * arbitrarily and may need further tuning. It corresponds to 1090 * 128 outstanding write IO requests (if IO size is 128 KiB), 1091 * which fits with many RAID controllers' tagged queuing limits. 1092 * The lower 1 MiB limit is the historical upper limit for 1093 * hirunningspace. 1094 */ 1095 hirunningspace = lmax(lmin(roundup(hibufspace / 64, maxbcachebuf), 1096 16 * 1024 * 1024), 1024 * 1024); 1097 lorunningspace = roundup((hirunningspace * 2) / 3, maxbcachebuf); 1098 1099 /* 1100 * Limit the amount of malloc memory since it is wired permanently into 1101 * the kernel space. Even though this is accounted for in the buffer 1102 * allocation, we don't want the malloced region to grow uncontrolled. 1103 * The malloc scheme improves memory utilization significantly on 1104 * average (small) directories. 1105 */ 1106 maxbufmallocspace = hibufspace / 20; 1107 1108 /* 1109 * Reduce the chance of a deadlock occurring by limiting the number 1110 * of delayed-write dirty buffers we allow to stack up. 
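 *
 *	For example, with a hypothetical nbuf of 65536 the code below starts
 *	from
 *
 *		hidirtybuffers = 65536 / 4 + 20   = 16404
 *		dirtybufthresh = 16404 * 9 / 10   = 14763
 *
 *	then halves hidirtybuffers, if necessary, until
 *	hidirtybuffers * BKVASIZE fits within 3/4 of hibufspace, and finally
 *	sets lodirtybuffers to half of hidirtybuffers.
 *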
1111 */ 1112 hidirtybuffers = nbuf / 4 + 20; 1113 dirtybufthresh = hidirtybuffers * 9 / 10; 1114 numdirtybuffers = 0; 1115 /* 1116 * To support extreme low-memory systems, make sure hidirtybuffers 1117 * cannot eat up all available buffer space. This occurs when our 1118 * minimum cannot be met. We try to size hidirtybuffers to 3/4 our 1119 * buffer space assuming BKVASIZE'd buffers. 1120 */ 1121 while ((long)hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) { 1122 hidirtybuffers >>= 1; 1123 } 1124 lodirtybuffers = hidirtybuffers / 2; 1125 1126 /* 1127 * lofreebuffers should be sufficient to avoid stalling waiting on 1128 * buf headers under heavy utilization. The bufs in per-cpu caches 1129 * are counted as free but will be unavailable to threads executing 1130 * on other cpus. 1131 * 1132 * hifreebuffers is the free target for the bufspace daemon. This 1133 * should be set appropriately to limit work per-iteration. 1134 */ 1135 lofreebuffers = MIN((nbuf / 25) + (20 * mp_ncpus), 128 * mp_ncpus); 1136 hifreebuffers = (3 * lofreebuffers) / 2; 1137 numfreebuffers = nbuf; 1138 1139 /* Setup the kva and free list allocators. */ 1140 vmem_set_reclaim(buffer_arena, bufkva_reclaim); 1141 buf_zone = uma_zcache_create("buf free cache", sizeof(struct buf), 1142 NULL, NULL, NULL, NULL, buf_import, buf_release, NULL, 0); 1143 1144 /* 1145 * Size the clean queue according to the amount of buffer space. 1146 * One queue per-256mb up to the max. More queues gives better 1147 * concurrency but less accurate LRU. 1148 */ 1149 clean_queues = MIN(howmany(maxbufspace, 256*1024*1024), CLEAN_QUEUES); 1150 1151 } 1152 1153 #ifdef INVARIANTS 1154 static inline void 1155 vfs_buf_check_mapped(struct buf *bp) 1156 { 1157 1158 KASSERT(bp->b_kvabase != unmapped_buf, 1159 ("mapped buf: b_kvabase was not updated %p", bp)); 1160 KASSERT(bp->b_data != unmapped_buf, 1161 ("mapped buf: b_data was not updated %p", bp)); 1162 KASSERT(bp->b_data < unmapped_buf || bp->b_data >= unmapped_buf + 1163 MAXPHYS, ("b_data + b_offset unmapped %p", bp)); 1164 } 1165 1166 static inline void 1167 vfs_buf_check_unmapped(struct buf *bp) 1168 { 1169 1170 KASSERT(bp->b_data == unmapped_buf, 1171 ("unmapped buf: corrupted b_data %p", bp)); 1172 } 1173 1174 #define BUF_CHECK_MAPPED(bp) vfs_buf_check_mapped(bp) 1175 #define BUF_CHECK_UNMAPPED(bp) vfs_buf_check_unmapped(bp) 1176 #else 1177 #define BUF_CHECK_MAPPED(bp) do {} while (0) 1178 #define BUF_CHECK_UNMAPPED(bp) do {} while (0) 1179 #endif 1180 1181 static int 1182 isbufbusy(struct buf *bp) 1183 { 1184 if (((bp->b_flags & B_INVAL) == 0 && BUF_ISLOCKED(bp)) || 1185 ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI)) 1186 return (1); 1187 return (0); 1188 } 1189 1190 /* 1191 * Shutdown the system cleanly to prepare for reboot, halt, or power off. 1192 */ 1193 void 1194 bufshutdown(int show_busybufs) 1195 { 1196 static int first_buf_printf = 1; 1197 struct buf *bp; 1198 int iter, nbusy, pbusy; 1199 #ifndef PREEMPTION 1200 int subiter; 1201 #endif 1202 1203 /* 1204 * Sync filesystems for shutdown 1205 */ 1206 wdog_kern_pat(WD_LASTVAL); 1207 sys_sync(curthread, NULL); 1208 1209 /* 1210 * With soft updates, some buffers that are 1211 * written will be remarked as dirty until other 1212 * buffers are written. 
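 *
 * Hence the loop below keeps syncing and re-counting: it allows up to
 * 20 passes, but restarts that budget whenever the busy count drops.
 * A minimal userland sketch of the same retry-with-progress idea
 * (count_busy() is a hypothetical stand-in for scanning the buf array):
 *
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	static int count_busy(void) { static int n = 5; return (n > 0 ? n-- : 0); }
 *
 *	int
 *	main(void)
 *	{
 *		int iter, nbusy, pbusy;
 *
 *		for (iter = pbusy = 0; iter < 20; iter++) {
 *			nbusy = count_busy();
 *			if (nbusy == 0)
 *				break;		// everything synced
 *			if (nbusy < pbusy)
 *				iter = 0;	// progress: reset the budget
 *			pbusy = nbusy;
 *			usleep(50000 * iter);	// back off, as DELAY() does
 *		}
 *		printf("still busy: %d\n", nbusy);
 *		return (0);
 *	}
 *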
1213 */ 1214 for (iter = pbusy = 0; iter < 20; iter++) { 1215 nbusy = 0; 1216 for (bp = &buf[nbuf]; --bp >= buf; ) 1217 if (isbufbusy(bp)) 1218 nbusy++; 1219 if (nbusy == 0) { 1220 if (first_buf_printf) 1221 printf("All buffers synced."); 1222 break; 1223 } 1224 if (first_buf_printf) { 1225 printf("Syncing disks, buffers remaining... "); 1226 first_buf_printf = 0; 1227 } 1228 printf("%d ", nbusy); 1229 if (nbusy < pbusy) 1230 iter = 0; 1231 pbusy = nbusy; 1232 1233 wdog_kern_pat(WD_LASTVAL); 1234 sys_sync(curthread, NULL); 1235 1236 #ifdef PREEMPTION 1237 /* 1238 * Drop Giant and spin for a while to allow 1239 * interrupt threads to run. 1240 */ 1241 DROP_GIANT(); 1242 DELAY(50000 * iter); 1243 PICKUP_GIANT(); 1244 #else 1245 /* 1246 * Drop Giant and context switch several times to 1247 * allow interrupt threads to run. 1248 */ 1249 DROP_GIANT(); 1250 for (subiter = 0; subiter < 50 * iter; subiter++) { 1251 thread_lock(curthread); 1252 mi_switch(SW_VOL, NULL); 1253 thread_unlock(curthread); 1254 DELAY(1000); 1255 } 1256 PICKUP_GIANT(); 1257 #endif 1258 } 1259 printf("\n"); 1260 /* 1261 * Count only busy local buffers to prevent forcing 1262 * a fsck if we're just a client of a wedged NFS server 1263 */ 1264 nbusy = 0; 1265 for (bp = &buf[nbuf]; --bp >= buf; ) { 1266 if (isbufbusy(bp)) { 1267 #if 0 1268 /* XXX: This is bogus. We should probably have a BO_REMOTE flag instead */ 1269 if (bp->b_dev == NULL) { 1270 TAILQ_REMOVE(&mountlist, 1271 bp->b_vp->v_mount, mnt_list); 1272 continue; 1273 } 1274 #endif 1275 nbusy++; 1276 if (show_busybufs > 0) { 1277 printf( 1278 "%d: buf:%p, vnode:%p, flags:%0x, blkno:%jd, lblkno:%jd, buflock:", 1279 nbusy, bp, bp->b_vp, bp->b_flags, 1280 (intmax_t)bp->b_blkno, 1281 (intmax_t)bp->b_lblkno); 1282 BUF_LOCKPRINTINFO(bp); 1283 if (show_busybufs > 1) 1284 vn_printf(bp->b_vp, 1285 "vnode content: "); 1286 } 1287 } 1288 } 1289 if (nbusy) { 1290 /* 1291 * Failed to sync all blocks. Indicate this and don't 1292 * unmount filesystems (thus forcing an fsck on reboot). 1293 */ 1294 printf("Giving up on %d buffers\n", nbusy); 1295 DELAY(5000000); /* 5 seconds */ 1296 } else { 1297 if (!first_buf_printf) 1298 printf("Final sync complete\n"); 1299 /* 1300 * Unmount filesystems 1301 */ 1302 if (panicstr == NULL) 1303 vfs_unmountall(); 1304 } 1305 swapoff_all(); 1306 DELAY(100000); /* wait for console output to finish */ 1307 } 1308 1309 static void 1310 bpmap_qenter(struct buf *bp) 1311 { 1312 1313 BUF_CHECK_MAPPED(bp); 1314 1315 /* 1316 * bp->b_data is relative to bp->b_offset, but 1317 * bp->b_offset may be offset into the first page. 1318 */ 1319 bp->b_data = (caddr_t)trunc_page((vm_offset_t)bp->b_data); 1320 pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages); 1321 bp->b_data = (caddr_t)((vm_offset_t)bp->b_data | 1322 (vm_offset_t)(bp->b_offset & PAGE_MASK)); 1323 } 1324 1325 /* 1326 * binsfree: 1327 * 1328 * Insert the buffer into the appropriate free list. 1329 */ 1330 static void 1331 binsfree(struct buf *bp, int qindex) 1332 { 1333 struct mtx *olock, *nlock; 1334 1335 if (qindex != QUEUE_EMPTY) { 1336 BUF_ASSERT_XLOCKED(bp); 1337 } 1338 1339 /* 1340 * Stick to the same clean queue for the lifetime of the buf to 1341 * limit locking below. Otherwise pick ont sequentially. 1342 */ 1343 if (qindex == QUEUE_CLEAN) { 1344 if (bqisclean(bp->b_qindex)) 1345 qindex = bp->b_qindex; 1346 else 1347 qindex = bqcleanq(); 1348 } 1349 1350 /* 1351 * Handle delayed bremfree() processing. 
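 *
 * The code below may have to take two queue locks: the old queue's lock
 * to complete a deferred removal (B_REMFREE) and the new queue's lock to
 * insert.  A userland sketch of that hand-off, with pthread mutexes
 * standing in for the per-queue bqlocks (requeue() and its arguments are
 * hypothetical):
 *
 *	#include <pthread.h>
 *
 *	static pthread_mutex_t qlock[2] = {
 *		PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER
 *	};
 *
 *	static void
 *	requeue(int oldq, int newq, int deferred_removal)
 *	{
 *		if (deferred_removal) {
 *			pthread_mutex_lock(&qlock[oldq]);
 *			// ... remove the item from the old queue ...
 *			if (oldq != newq) {
 *				pthread_mutex_unlock(&qlock[oldq]);
 *				pthread_mutex_lock(&qlock[newq]);
 *			}
 *		} else
 *			pthread_mutex_lock(&qlock[newq]);
 *		// ... insert the item into the new queue ...
 *		pthread_mutex_unlock(&qlock[newq]);
 *	}
 *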
1352 */ 1353 nlock = bqlock(qindex); 1354 if (bp->b_flags & B_REMFREE) { 1355 olock = bqlock(bp->b_qindex); 1356 mtx_lock(olock); 1357 bremfreel(bp); 1358 if (olock != nlock) { 1359 mtx_unlock(olock); 1360 mtx_lock(nlock); 1361 } 1362 } else 1363 mtx_lock(nlock); 1364 1365 if (bp->b_qindex != QUEUE_NONE) 1366 panic("binsfree: free buffer onto another queue???"); 1367 1368 bp->b_qindex = qindex; 1369 if (bp->b_flags & B_AGE) 1370 TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist); 1371 else 1372 TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist); 1373 #ifdef INVARIANTS 1374 bq_len[bp->b_qindex]++; 1375 #endif 1376 mtx_unlock(nlock); 1377 } 1378 1379 /* 1380 * buf_free: 1381 * 1382 * Free a buffer to the buf zone once it no longer has valid contents. 1383 */ 1384 static void 1385 buf_free(struct buf *bp) 1386 { 1387 1388 if (bp->b_flags & B_REMFREE) 1389 bremfreef(bp); 1390 if (bp->b_vflags & BV_BKGRDINPROG) 1391 panic("losing buffer 1"); 1392 if (bp->b_rcred != NOCRED) { 1393 crfree(bp->b_rcred); 1394 bp->b_rcred = NOCRED; 1395 } 1396 if (bp->b_wcred != NOCRED) { 1397 crfree(bp->b_wcred); 1398 bp->b_wcred = NOCRED; 1399 } 1400 if (!LIST_EMPTY(&bp->b_dep)) 1401 buf_deallocate(bp); 1402 bufkva_free(bp); 1403 BUF_UNLOCK(bp); 1404 uma_zfree(buf_zone, bp); 1405 atomic_add_int(&numfreebuffers, 1); 1406 bufspace_wakeup(); 1407 } 1408 1409 /* 1410 * buf_import: 1411 * 1412 * Import bufs into the uma cache from the buf list. The system still 1413 * expects a static array of bufs and much of the synchronization 1414 * around bufs assumes type stable storage. As a result, UMA is used 1415 * only as a per-cpu cache of bufs still maintained on a global list. 1416 */ 1417 static int 1418 buf_import(void *arg, void **store, int cnt, int flags) 1419 { 1420 struct buf *bp; 1421 int i; 1422 1423 mtx_lock(&bqlocks[QUEUE_EMPTY]); 1424 for (i = 0; i < cnt; i++) { 1425 bp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]); 1426 if (bp == NULL) 1427 break; 1428 bremfreel(bp); 1429 store[i] = bp; 1430 } 1431 mtx_unlock(&bqlocks[QUEUE_EMPTY]); 1432 1433 return (i); 1434 } 1435 1436 /* 1437 * buf_release: 1438 * 1439 * Release bufs from the uma cache back to the buffer queues. 1440 */ 1441 static void 1442 buf_release(void *arg, void **store, int cnt) 1443 { 1444 int i; 1445 1446 for (i = 0; i < cnt; i++) 1447 binsfree(store[i], QUEUE_EMPTY); 1448 } 1449 1450 /* 1451 * buf_alloc: 1452 * 1453 * Allocate an empty buffer header. 1454 */ 1455 static struct buf * 1456 buf_alloc(void) 1457 { 1458 struct buf *bp; 1459 1460 bp = uma_zalloc(buf_zone, M_NOWAIT); 1461 if (bp == NULL) { 1462 bufspace_daemonwakeup(); 1463 atomic_add_int(&numbufallocfails, 1); 1464 return (NULL); 1465 } 1466 1467 /* 1468 * Wake-up the bufspace daemon on transition. 1469 */ 1470 if (atomic_fetchadd_int(&numfreebuffers, -1) == lofreebuffers) 1471 bufspace_daemonwakeup(); 1472 1473 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) 1474 panic("getnewbuf_empty: Locked buf %p on free queue.", bp); 1475 1476 KASSERT(bp->b_vp == NULL, 1477 ("bp: %p still has vnode %p.", bp, bp->b_vp)); 1478 KASSERT((bp->b_flags & (B_DELWRI | B_NOREUSE)) == 0, 1479 ("invalid buffer %p flags %#x", bp, bp->b_flags)); 1480 KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0, 1481 ("bp: %p still on a buffer list. 
xflags %X", bp, bp->b_xflags)); 1482 KASSERT(bp->b_npages == 0, 1483 ("bp: %p still has %d vm pages\n", bp, bp->b_npages)); 1484 KASSERT(bp->b_kvasize == 0, ("bp: %p still has kva\n", bp)); 1485 KASSERT(bp->b_bufsize == 0, ("bp: %p still has bufspace\n", bp)); 1486 1487 bp->b_flags = 0; 1488 bp->b_ioflags = 0; 1489 bp->b_xflags = 0; 1490 bp->b_vflags = 0; 1491 bp->b_vp = NULL; 1492 bp->b_blkno = bp->b_lblkno = 0; 1493 bp->b_offset = NOOFFSET; 1494 bp->b_iodone = 0; 1495 bp->b_error = 0; 1496 bp->b_resid = 0; 1497 bp->b_bcount = 0; 1498 bp->b_npages = 0; 1499 bp->b_dirtyoff = bp->b_dirtyend = 0; 1500 bp->b_bufobj = NULL; 1501 bp->b_data = bp->b_kvabase = unmapped_buf; 1502 bp->b_fsprivate1 = NULL; 1503 bp->b_fsprivate2 = NULL; 1504 bp->b_fsprivate3 = NULL; 1505 LIST_INIT(&bp->b_dep); 1506 1507 return (bp); 1508 } 1509 1510 /* 1511 * buf_qrecycle: 1512 * 1513 * Free a buffer from the given bufqueue. kva controls whether the 1514 * freed buf must own some kva resources. This is used for 1515 * defragmenting. 1516 */ 1517 static int 1518 buf_qrecycle(int qindex, bool kva) 1519 { 1520 struct buf *bp, *nbp; 1521 1522 if (kva) 1523 atomic_add_int(&bufdefragcnt, 1); 1524 nbp = NULL; 1525 mtx_lock(&bqlocks[qindex]); 1526 nbp = TAILQ_FIRST(&bufqueues[qindex]); 1527 1528 /* 1529 * Run scan, possibly freeing data and/or kva mappings on the fly 1530 * depending. 1531 */ 1532 while ((bp = nbp) != NULL) { 1533 /* 1534 * Calculate next bp (we can only use it if we do not 1535 * release the bqlock). 1536 */ 1537 nbp = TAILQ_NEXT(bp, b_freelist); 1538 1539 /* 1540 * If we are defragging then we need a buffer with 1541 * some kva to reclaim. 1542 */ 1543 if (kva && bp->b_kvasize == 0) 1544 continue; 1545 1546 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) 1547 continue; 1548 1549 /* 1550 * Skip buffers with background writes in progress. 1551 */ 1552 if ((bp->b_vflags & BV_BKGRDINPROG) != 0) { 1553 BUF_UNLOCK(bp); 1554 continue; 1555 } 1556 1557 KASSERT(bp->b_qindex == qindex, 1558 ("getnewbuf: inconsistent queue %d bp %p", qindex, bp)); 1559 /* 1560 * NOTE: nbp is now entirely invalid. We can only restart 1561 * the scan from this point on. 1562 */ 1563 bremfreel(bp); 1564 mtx_unlock(&bqlocks[qindex]); 1565 1566 /* 1567 * Requeue the background write buffer with error and 1568 * restart the scan. 1569 */ 1570 if ((bp->b_vflags & BV_BKGRDERR) != 0) { 1571 bqrelse(bp); 1572 mtx_lock(&bqlocks[qindex]); 1573 nbp = TAILQ_FIRST(&bufqueues[qindex]); 1574 continue; 1575 } 1576 bp->b_flags |= B_INVAL; 1577 brelse(bp); 1578 return (0); 1579 } 1580 mtx_unlock(&bqlocks[qindex]); 1581 1582 return (ENOBUFS); 1583 } 1584 1585 /* 1586 * buf_recycle: 1587 * 1588 * Iterate through all clean queues until we find a buf to recycle or 1589 * exhaust the search. 1590 */ 1591 static int 1592 buf_recycle(bool kva) 1593 { 1594 int qindex, first_qindex; 1595 1596 qindex = first_qindex = bqcleanq(); 1597 do { 1598 if (buf_qrecycle(qindex, kva) == 0) 1599 return (0); 1600 if (++qindex == QUEUE_CLEAN + clean_queues) 1601 qindex = QUEUE_CLEAN; 1602 } while (qindex != first_qindex); 1603 1604 return (ENOBUFS); 1605 } 1606 1607 /* 1608 * buf_scan: 1609 * 1610 * Scan the clean queues looking for a buffer to recycle. needsbuffer 1611 * is set on failure so that the caller may optionally bufspace_wait() 1612 * in a race-free fashion. 1613 */ 1614 static int 1615 buf_scan(bool defrag) 1616 { 1617 int error; 1618 1619 /* 1620 * To avoid heavy synchronization and wakeup races we set 1621 * needsbuffer and re-poll before failing. 
This ensures that 1622 * no frees can be missed between an unsuccessful poll and 1623 * going to sleep in a synchronized fashion. 1624 */ 1625 if ((error = buf_recycle(defrag)) != 0) { 1626 atomic_set_int(&needsbuffer, 1); 1627 bufspace_daemonwakeup(); 1628 error = buf_recycle(defrag); 1629 } 1630 if (error == 0) 1631 atomic_add_int(&getnewbufrestarts, 1); 1632 return (error); 1633 } 1634 1635 /* 1636 * bremfree: 1637 * 1638 * Mark the buffer for removal from the appropriate free list. 1639 * 1640 */ 1641 void 1642 bremfree(struct buf *bp) 1643 { 1644 1645 CTR3(KTR_BUF, "bremfree(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); 1646 KASSERT((bp->b_flags & B_REMFREE) == 0, 1647 ("bremfree: buffer %p already marked for delayed removal.", bp)); 1648 KASSERT(bp->b_qindex != QUEUE_NONE, 1649 ("bremfree: buffer %p not on a queue.", bp)); 1650 BUF_ASSERT_XLOCKED(bp); 1651 1652 bp->b_flags |= B_REMFREE; 1653 } 1654 1655 /* 1656 * bremfreef: 1657 * 1658 * Force an immediate removal from a free list. Used only in nfs when 1659 * it abuses the b_freelist pointer. 1660 */ 1661 void 1662 bremfreef(struct buf *bp) 1663 { 1664 struct mtx *qlock; 1665 1666 qlock = bqlock(bp->b_qindex); 1667 mtx_lock(qlock); 1668 bremfreel(bp); 1669 mtx_unlock(qlock); 1670 } 1671 1672 /* 1673 * bremfreel: 1674 * 1675 * Removes a buffer from the free list, must be called with the 1676 * correct qlock held. 1677 */ 1678 static void 1679 bremfreel(struct buf *bp) 1680 { 1681 1682 CTR3(KTR_BUF, "bremfreel(%p) vp %p flags %X", 1683 bp, bp->b_vp, bp->b_flags); 1684 KASSERT(bp->b_qindex != QUEUE_NONE, 1685 ("bremfreel: buffer %p not on a queue.", bp)); 1686 if (bp->b_qindex != QUEUE_EMPTY) { 1687 BUF_ASSERT_XLOCKED(bp); 1688 } 1689 mtx_assert(bqlock(bp->b_qindex), MA_OWNED); 1690 1691 TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist); 1692 #ifdef INVARIANTS 1693 KASSERT(bq_len[bp->b_qindex] >= 1, ("queue %d underflow", 1694 bp->b_qindex)); 1695 bq_len[bp->b_qindex]--; 1696 #endif 1697 bp->b_qindex = QUEUE_NONE; 1698 bp->b_flags &= ~B_REMFREE; 1699 } 1700 1701 /* 1702 * bufkva_free: 1703 * 1704 * Free the kva allocation for a buffer. 1705 * 1706 */ 1707 static void 1708 bufkva_free(struct buf *bp) 1709 { 1710 1711 #ifdef INVARIANTS 1712 if (bp->b_kvasize == 0) { 1713 KASSERT(bp->b_kvabase == unmapped_buf && 1714 bp->b_data == unmapped_buf, 1715 ("Leaked KVA space on %p", bp)); 1716 } else if (buf_mapped(bp)) 1717 BUF_CHECK_MAPPED(bp); 1718 else 1719 BUF_CHECK_UNMAPPED(bp); 1720 #endif 1721 if (bp->b_kvasize == 0) 1722 return; 1723 1724 vmem_free(buffer_arena, (vm_offset_t)bp->b_kvabase, bp->b_kvasize); 1725 atomic_subtract_long(&bufkvaspace, bp->b_kvasize); 1726 atomic_add_int(&buffreekvacnt, 1); 1727 bp->b_data = bp->b_kvabase = unmapped_buf; 1728 bp->b_kvasize = 0; 1729 } 1730 1731 /* 1732 * bufkva_alloc: 1733 * 1734 * Allocate the buffer KVA and set b_kvasize and b_kvabase. 1735 */ 1736 static int 1737 bufkva_alloc(struct buf *bp, int maxsize, int gbflags) 1738 { 1739 vm_offset_t addr; 1740 int error; 1741 1742 KASSERT((gbflags & GB_UNMAPPED) == 0 || (gbflags & GB_KVAALLOC) != 0, 1743 ("Invalid gbflags 0x%x in %s", gbflags, __func__)); 1744 1745 bufkva_free(bp); 1746 1747 addr = 0; 1748 error = vmem_alloc(buffer_arena, maxsize, M_BESTFIT | M_NOWAIT, &addr); 1749 if (error != 0) { 1750 /* 1751 * Buffer map is too fragmented. Request the caller 1752 * to defragment the map. 
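 *
 * A sketch of the retry protocol a caller can build on top of this
 * return value (the helper below is hypothetical and not part of this
 * file): free KVA held by clean buffers via buf_recycle(true), then try
 * the allocation again a bounded number of times.
 *
 *	static int
 *	getkva_with_defrag(struct buf *bp, int maxsize, int gbflags)
 *	{
 *		int tries;
 *
 *		for (tries = 0; tries < 3; tries++) {
 *			if (bufkva_alloc(bp, maxsize, gbflags) == 0)
 *				return (0);
 *			// Nothing left to recycle; give up.
 *			if (buf_recycle(true) != 0)
 *				break;
 *		}
 *		return (ENOSPC);
 *	}
 *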
1753 */ 1754 return (error); 1755 } 1756 bp->b_kvabase = (caddr_t)addr; 1757 bp->b_kvasize = maxsize; 1758 atomic_add_long(&bufkvaspace, bp->b_kvasize); 1759 if ((gbflags & GB_UNMAPPED) != 0) { 1760 bp->b_data = unmapped_buf; 1761 BUF_CHECK_UNMAPPED(bp); 1762 } else { 1763 bp->b_data = bp->b_kvabase; 1764 BUF_CHECK_MAPPED(bp); 1765 } 1766 return (0); 1767 } 1768 1769 /* 1770 * bufkva_reclaim: 1771 * 1772 * Reclaim buffer kva by freeing buffers holding kva. This is a vmem 1773 * callback that fires to avoid returning failure. 1774 */ 1775 static void 1776 bufkva_reclaim(vmem_t *vmem, int flags) 1777 { 1778 int i; 1779 1780 for (i = 0; i < 5; i++) 1781 if (buf_scan(true) != 0) 1782 break; 1783 return; 1784 } 1785 1786 1787 /* 1788 * Attempt to initiate asynchronous I/O on read-ahead blocks. We must 1789 * clear BIO_ERROR and B_INVAL prior to initiating I/O . If B_CACHE is set, 1790 * the buffer is valid and we do not have to do anything. 1791 */ 1792 void 1793 breada(struct vnode * vp, daddr_t * rablkno, int * rabsize, 1794 int cnt, struct ucred * cred) 1795 { 1796 struct buf *rabp; 1797 int i; 1798 1799 for (i = 0; i < cnt; i++, rablkno++, rabsize++) { 1800 if (inmem(vp, *rablkno)) 1801 continue; 1802 rabp = getblk(vp, *rablkno, *rabsize, 0, 0, 0); 1803 1804 if ((rabp->b_flags & B_CACHE) == 0) { 1805 if (!TD_IS_IDLETHREAD(curthread)) { 1806 #ifdef RACCT 1807 if (racct_enable) { 1808 PROC_LOCK(curproc); 1809 racct_add_buf(curproc, rabp, 0); 1810 PROC_UNLOCK(curproc); 1811 } 1812 #endif /* RACCT */ 1813 curthread->td_ru.ru_inblock++; 1814 } 1815 rabp->b_flags |= B_ASYNC; 1816 rabp->b_flags &= ~B_INVAL; 1817 rabp->b_ioflags &= ~BIO_ERROR; 1818 rabp->b_iocmd = BIO_READ; 1819 if (rabp->b_rcred == NOCRED && cred != NOCRED) 1820 rabp->b_rcred = crhold(cred); 1821 vfs_busy_pages(rabp, 0); 1822 BUF_KERNPROC(rabp); 1823 rabp->b_iooffset = dbtob(rabp->b_blkno); 1824 bstrategy(rabp); 1825 } else { 1826 brelse(rabp); 1827 } 1828 } 1829 } 1830 1831 /* 1832 * Entry point for bread() and breadn() via #defines in sys/buf.h. 1833 * 1834 * Get a buffer with the specified data. Look in the cache first. We 1835 * must clear BIO_ERROR and B_INVAL prior to initiating I/O. If B_CACHE 1836 * is set, the buffer is valid and we do not have to do anything, see 1837 * getblk(). Also starts asynchronous I/O on read-ahead blocks. 1838 * 1839 * Always return a NULL buffer pointer (in bpp) when returning an error. 1840 */ 1841 int 1842 breadn_flags(struct vnode *vp, daddr_t blkno, int size, daddr_t *rablkno, 1843 int *rabsize, int cnt, struct ucred *cred, int flags, struct buf **bpp) 1844 { 1845 struct buf *bp; 1846 int rv = 0, readwait = 0; 1847 1848 CTR3(KTR_BUF, "breadn(%p, %jd, %d)", vp, blkno, size); 1849 /* 1850 * Can only return NULL if GB_LOCK_NOWAIT flag is specified. 
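 *
 * Typical consumer-side usage, sketched for a hypothetical filesystem;
 * bread() is one of the sys/buf.h wrappers that ends up here, and vp,
 * lblkno and blksize are placeholders:
 *
 *	struct buf *bp;
 *	int error;
 *
 *	error = bread(vp, lblkno, blksize, NOCRED, &bp);
 *	if (error != 0)
 *		return (error);		// bp was already set to NULL
 *	// ... read up to blksize bytes from bp->b_data ...
 *	brelse(bp);			// or bdwrite(bp) if it was modified
 *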
1851 */ 1852 *bpp = bp = getblk(vp, blkno, size, 0, 0, flags); 1853 if (bp == NULL) 1854 return (EBUSY); 1855 1856 /* if not found in cache, do some I/O */ 1857 if ((bp->b_flags & B_CACHE) == 0) { 1858 if (!TD_IS_IDLETHREAD(curthread)) { 1859 #ifdef RACCT 1860 if (racct_enable) { 1861 PROC_LOCK(curproc); 1862 racct_add_buf(curproc, bp, 0); 1863 PROC_UNLOCK(curproc); 1864 } 1865 #endif /* RACCT */ 1866 curthread->td_ru.ru_inblock++; 1867 } 1868 bp->b_iocmd = BIO_READ; 1869 bp->b_flags &= ~B_INVAL; 1870 bp->b_ioflags &= ~BIO_ERROR; 1871 if (bp->b_rcred == NOCRED && cred != NOCRED) 1872 bp->b_rcred = crhold(cred); 1873 vfs_busy_pages(bp, 0); 1874 bp->b_iooffset = dbtob(bp->b_blkno); 1875 bstrategy(bp); 1876 ++readwait; 1877 } 1878 1879 breada(vp, rablkno, rabsize, cnt, cred); 1880 1881 if (readwait) { 1882 rv = bufwait(bp); 1883 if (rv != 0) { 1884 brelse(bp); 1885 *bpp = NULL; 1886 } 1887 } 1888 return (rv); 1889 } 1890 1891 /* 1892 * Write, release buffer on completion. (Done by iodone 1893 * if async). Do not bother writing anything if the buffer 1894 * is invalid. 1895 * 1896 * Note that we set B_CACHE here, indicating that buffer is 1897 * fully valid and thus cacheable. This is true even of NFS 1898 * now so we set it generally. This could be set either here 1899 * or in biodone() since the I/O is synchronous. We put it 1900 * here. 1901 */ 1902 int 1903 bufwrite(struct buf *bp) 1904 { 1905 int oldflags; 1906 struct vnode *vp; 1907 long space; 1908 int vp_md; 1909 1910 CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); 1911 if ((bp->b_bufobj->bo_flag & BO_DEAD) != 0) { 1912 bp->b_flags |= B_INVAL | B_RELBUF; 1913 bp->b_flags &= ~B_CACHE; 1914 brelse(bp); 1915 return (ENXIO); 1916 } 1917 if (bp->b_flags & B_INVAL) { 1918 brelse(bp); 1919 return (0); 1920 } 1921 1922 if (bp->b_flags & B_BARRIER) 1923 barrierwrites++; 1924 1925 oldflags = bp->b_flags; 1926 1927 BUF_ASSERT_HELD(bp); 1928 1929 KASSERT(!(bp->b_vflags & BV_BKGRDINPROG), 1930 ("FFS background buffer should not get here %p", bp)); 1931 1932 vp = bp->b_vp; 1933 if (vp) 1934 vp_md = vp->v_vflag & VV_MD; 1935 else 1936 vp_md = 0; 1937 1938 /* 1939 * Mark the buffer clean. Increment the bufobj write count 1940 * before bundirty() call, to prevent other thread from seeing 1941 * empty dirty list and zero counter for writes in progress, 1942 * falsely indicating that the bufobj is clean. 1943 */ 1944 bufobj_wref(bp->b_bufobj); 1945 bundirty(bp); 1946 1947 bp->b_flags &= ~B_DONE; 1948 bp->b_ioflags &= ~BIO_ERROR; 1949 bp->b_flags |= B_CACHE; 1950 bp->b_iocmd = BIO_WRITE; 1951 1952 vfs_busy_pages(bp, 1); 1953 1954 /* 1955 * Normal bwrites pipeline writes 1956 */ 1957 bp->b_runningbufspace = bp->b_bufsize; 1958 space = atomic_fetchadd_long(&runningbufspace, bp->b_runningbufspace); 1959 1960 if (!TD_IS_IDLETHREAD(curthread)) { 1961 #ifdef RACCT 1962 if (racct_enable) { 1963 PROC_LOCK(curproc); 1964 racct_add_buf(curproc, bp, 1); 1965 PROC_UNLOCK(curproc); 1966 } 1967 #endif /* RACCT */ 1968 curthread->td_ru.ru_oublock++; 1969 } 1970 if (oldflags & B_ASYNC) 1971 BUF_KERNPROC(bp); 1972 bp->b_iooffset = dbtob(bp->b_blkno); 1973 buf_track(bp, __func__); 1974 bstrategy(bp); 1975 1976 if ((oldflags & B_ASYNC) == 0) { 1977 int rtval = bufwait(bp); 1978 brelse(bp); 1979 return (rtval); 1980 } else if (space > hirunningspace) { 1981 /* 1982 * don't allow the async write to saturate the I/O 1983 * system. We will not deadlock here because 1984 * we are blocking waiting for I/O that is already in-progress 1985 * to complete. 
We do not block here if it is the update 1986 * or syncer daemon trying to clean up as that can lead 1987 * to deadlock. 1988 */ 1989 if ((curthread->td_pflags & TDP_NORUNNINGBUF) == 0 && !vp_md) 1990 waitrunningbufspace(); 1991 } 1992 1993 return (0); 1994 } 1995 1996 void 1997 bufbdflush(struct bufobj *bo, struct buf *bp) 1998 { 1999 struct buf *nbp; 2000 2001 if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10) { 2002 (void) VOP_FSYNC(bp->b_vp, MNT_NOWAIT, curthread); 2003 altbufferflushes++; 2004 } else if (bo->bo_dirty.bv_cnt > dirtybufthresh) { 2005 BO_LOCK(bo); 2006 /* 2007 * Try to find a buffer to flush. 2008 */ 2009 TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) { 2010 if ((nbp->b_vflags & BV_BKGRDINPROG) || 2011 BUF_LOCK(nbp, 2012 LK_EXCLUSIVE | LK_NOWAIT, NULL)) 2013 continue; 2014 if (bp == nbp) 2015 panic("bdwrite: found ourselves"); 2016 BO_UNLOCK(bo); 2017 /* Don't countdeps with the bo lock held. */ 2018 if (buf_countdeps(nbp, 0)) { 2019 BO_LOCK(bo); 2020 BUF_UNLOCK(nbp); 2021 continue; 2022 } 2023 if (nbp->b_flags & B_CLUSTEROK) { 2024 vfs_bio_awrite(nbp); 2025 } else { 2026 bremfree(nbp); 2027 bawrite(nbp); 2028 } 2029 dirtybufferflushes++; 2030 break; 2031 } 2032 if (nbp == NULL) 2033 BO_UNLOCK(bo); 2034 } 2035 } 2036 2037 /* 2038 * Delayed write. (Buffer is marked dirty). Do not bother writing 2039 * anything if the buffer is marked invalid. 2040 * 2041 * Note that since the buffer must be completely valid, we can safely 2042 * set B_CACHE. In fact, we have to set B_CACHE here rather then in 2043 * biodone() in order to prevent getblk from writing the buffer 2044 * out synchronously. 2045 */ 2046 void 2047 bdwrite(struct buf *bp) 2048 { 2049 struct thread *td = curthread; 2050 struct vnode *vp; 2051 struct bufobj *bo; 2052 2053 CTR3(KTR_BUF, "bdwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); 2054 KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); 2055 KASSERT((bp->b_flags & B_BARRIER) == 0, 2056 ("Barrier request in delayed write %p", bp)); 2057 BUF_ASSERT_HELD(bp); 2058 2059 if (bp->b_flags & B_INVAL) { 2060 brelse(bp); 2061 return; 2062 } 2063 2064 /* 2065 * If we have too many dirty buffers, don't create any more. 2066 * If we are wildly over our limit, then force a complete 2067 * cleanup. Otherwise, just keep the situation from getting 2068 * out of control. Note that we have to avoid a recursive 2069 * disaster and not try to clean up after our own cleanup! 2070 */ 2071 vp = bp->b_vp; 2072 bo = bp->b_bufobj; 2073 if ((td->td_pflags & (TDP_COWINPROGRESS|TDP_INBDFLUSH)) == 0) { 2074 td->td_pflags |= TDP_INBDFLUSH; 2075 BO_BDFLUSH(bo, bp); 2076 td->td_pflags &= ~TDP_INBDFLUSH; 2077 } else 2078 recursiveflushes++; 2079 2080 bdirty(bp); 2081 /* 2082 * Set B_CACHE, indicating that the buffer is fully valid. This is 2083 * true even of NFS now. 2084 */ 2085 bp->b_flags |= B_CACHE; 2086 2087 /* 2088 * This bmap keeps the system from needing to do the bmap later, 2089 * perhaps when the system is attempting to do a sync. Since it 2090 * is likely that the indirect block -- or whatever other datastructure 2091 * that the filesystem needs is still in memory now, it is a good 2092 * thing to do this. Note also, that if the pageout daemon is 2093 * requesting a sync -- there might not be enough memory to do 2094 * the bmap then... So, this is important to do. 
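 *
 * Concretely: a buffer freshly created by getblk() still has b_blkno
 * equal to b_lblkno, i.e. no physical block has been assigned yet, so
 * the VOP_BMAP() below resolves it now, while the needed indirect
 * blocks are most likely still cached, instead of leaving the lookup
 * to the syncer.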
2095 */ 2096 if (vp->v_type != VCHR && bp->b_lblkno == bp->b_blkno) { 2097 VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL); 2098 } 2099 2100 buf_track(bp, __func__); 2101 2102 /* 2103 * Set the *dirty* buffer range based upon the VM system dirty 2104 * pages. 2105 * 2106 * Mark the buffer pages as clean. We need to do this here to 2107 * satisfy the vnode_pager and the pageout daemon, so that it 2108 * thinks that the pages have been "cleaned". Note that since 2109 * the pages are in a delayed write buffer -- the VFS layer 2110 * "will" see that the pages get written out on the next sync, 2111 * or perhaps the cluster will be completed. 2112 */ 2113 vfs_clean_pages_dirty_buf(bp); 2114 bqrelse(bp); 2115 2116 /* 2117 * note: we cannot initiate I/O from a bdwrite even if we wanted to, 2118 * due to the softdep code. 2119 */ 2120 } 2121 2122 /* 2123 * bdirty: 2124 * 2125 * Turn buffer into delayed write request. We must clear BIO_READ and 2126 * B_RELBUF, and we must set B_DELWRI. We reassign the buffer to 2127 * itself to properly update it in the dirty/clean lists. We mark it 2128 * B_DONE to ensure that any asynchronization of the buffer properly 2129 * clears B_DONE ( else a panic will occur later ). 2130 * 2131 * bdirty() is kinda like bdwrite() - we have to clear B_INVAL which 2132 * might have been set pre-getblk(). Unlike bwrite/bdwrite, bdirty() 2133 * should only be called if the buffer is known-good. 2134 * 2135 * Since the buffer is not on a queue, we do not update the numfreebuffers 2136 * count. 2137 * 2138 * The buffer must be on QUEUE_NONE. 2139 */ 2140 void 2141 bdirty(struct buf *bp) 2142 { 2143 2144 CTR3(KTR_BUF, "bdirty(%p) vp %p flags %X", 2145 bp, bp->b_vp, bp->b_flags); 2146 KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); 2147 KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE, 2148 ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex)); 2149 BUF_ASSERT_HELD(bp); 2150 bp->b_flags &= ~(B_RELBUF); 2151 bp->b_iocmd = BIO_WRITE; 2152 2153 if ((bp->b_flags & B_DELWRI) == 0) { 2154 bp->b_flags |= /* XXX B_DONE | */ B_DELWRI; 2155 reassignbuf(bp); 2156 bdirtyadd(); 2157 } 2158 } 2159 2160 /* 2161 * bundirty: 2162 * 2163 * Clear B_DELWRI for buffer. 2164 * 2165 * Since the buffer is not on a queue, we do not update the numfreebuffers 2166 * count. 2167 * 2168 * The buffer must be on QUEUE_NONE. 2169 */ 2170 2171 void 2172 bundirty(struct buf *bp) 2173 { 2174 2175 CTR3(KTR_BUF, "bundirty(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); 2176 KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); 2177 KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE, 2178 ("bundirty: buffer %p still on queue %d", bp, bp->b_qindex)); 2179 BUF_ASSERT_HELD(bp); 2180 2181 if (bp->b_flags & B_DELWRI) { 2182 bp->b_flags &= ~B_DELWRI; 2183 reassignbuf(bp); 2184 bdirtysub(); 2185 } 2186 /* 2187 * Since it is now being written, we can clear its deferred write flag. 2188 */ 2189 bp->b_flags &= ~B_DEFERRED; 2190 } 2191 2192 /* 2193 * bawrite: 2194 * 2195 * Asynchronous write. Start output on a buffer, but do not wait for 2196 * it to complete. The buffer is released when the output completes. 2197 * 2198 * bwrite() ( or the VOP routine anyway ) is responsible for handling 2199 * B_INVAL buffers. Not us. 2200 */ 2201 void 2202 bawrite(struct buf *bp) 2203 { 2204 2205 bp->b_flags |= B_ASYNC; 2206 (void) bwrite(bp); 2207 } 2208 2209 /* 2210 * babarrierwrite: 2211 * 2212 * Asynchronous barrier write. Start output on a buffer, but do not 2213 * wait for it to complete. 
Place a write barrier after this write so 2214 * that this buffer and all buffers written before it are committed to 2215 * the disk before any buffers written after this write are committed 2216 * to the disk. The buffer is released when the output completes. 2217 */ 2218 void 2219 babarrierwrite(struct buf *bp) 2220 { 2221 2222 bp->b_flags |= B_ASYNC | B_BARRIER; 2223 (void) bwrite(bp); 2224 } 2225 2226 /* 2227 * bbarrierwrite: 2228 * 2229 * Synchronous barrier write. Start output on a buffer and wait for 2230 * it to complete. Place a write barrier after this write so that 2231 * this buffer and all buffers written before it are committed to 2232 * the disk before any buffers written after this write are committed 2233 * to the disk. The buffer is released when the output completes. 2234 */ 2235 int 2236 bbarrierwrite(struct buf *bp) 2237 { 2238 2239 bp->b_flags |= B_BARRIER; 2240 return (bwrite(bp)); 2241 } 2242 2243 /* 2244 * bwillwrite: 2245 * 2246 * Called prior to the locking of any vnodes when we are expecting to 2247 * write. We do not want to starve the buffer cache with too many 2248 * dirty buffers so we block here. By blocking prior to the locking 2249 * of any vnodes we attempt to avoid the situation where a locked vnode 2250 * prevents the various system daemons from flushing related buffers. 2251 */ 2252 void 2253 bwillwrite(void) 2254 { 2255 2256 if (numdirtybuffers >= hidirtybuffers) { 2257 mtx_lock(&bdirtylock); 2258 while (numdirtybuffers >= hidirtybuffers) { 2259 bdirtywait = 1; 2260 msleep(&bdirtywait, &bdirtylock, (PRIBIO + 4), 2261 "flswai", 0); 2262 } 2263 mtx_unlock(&bdirtylock); 2264 } 2265 } 2266 2267 /* 2268 * Return true if we have too many dirty buffers. 2269 */ 2270 int 2271 buf_dirty_count_severe(void) 2272 { 2273 2274 return(numdirtybuffers >= hidirtybuffers); 2275 } 2276 2277 /* 2278 * brelse: 2279 * 2280 * Release a busy buffer and, if requested, free its resources. The 2281 * buffer will be stashed in the appropriate bufqueue[] allowing it 2282 * to be accessed later as a cache entity or reused for other purposes. 2283 */ 2284 void 2285 brelse(struct buf *bp) 2286 { 2287 int qindex; 2288 2289 /* 2290 * Many functions erroneously call brelse with a NULL bp under rare 2291 * error conditions. Simply return when called with a NULL bp. 2292 */ 2293 if (bp == NULL) 2294 return; 2295 CTR3(KTR_BUF, "brelse(%p) vp %p flags %X", 2296 bp, bp->b_vp, bp->b_flags); 2297 KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), 2298 ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); 2299 KASSERT((bp->b_flags & B_VMIO) != 0 || (bp->b_flags & B_NOREUSE) == 0, 2300 ("brelse: non-VMIO buffer marked NOREUSE")); 2301 2302 if (BUF_LOCKRECURSED(bp)) { 2303 /* 2304 * Do not process, in particular, do not handle the 2305 * B_INVAL/B_RELBUF and do not release to free list. 2306 */ 2307 BUF_UNLOCK(bp); 2308 return; 2309 } 2310 2311 if (bp->b_flags & B_MANAGED) { 2312 bqrelse(bp); 2313 return; 2314 } 2315 2316 if ((bp->b_vflags & (BV_BKGRDINPROG | BV_BKGRDERR)) == BV_BKGRDERR) { 2317 BO_LOCK(bp->b_bufobj); 2318 bp->b_vflags &= ~BV_BKGRDERR; 2319 BO_UNLOCK(bp->b_bufobj); 2320 bdirty(bp); 2321 } 2322 if (bp->b_iocmd == BIO_WRITE && (bp->b_ioflags & BIO_ERROR) && 2323 (bp->b_error != ENXIO || !LIST_EMPTY(&bp->b_dep)) && 2324 !(bp->b_flags & B_INVAL)) { 2325 /* 2326 * Failed write, redirty. 
All errors except ENXIO (which 2327 * means the device is gone) are expected to be potentially 2328 * transient - underlying media might work if tried again 2329 * after EIO, and memory might be available after an ENOMEM. 2330 * 2331 * Do this also for buffers that failed with ENXIO, but have 2332 * non-empty dependencies - the soft updates code might need 2333 * to access the buffer to untangle them. 2334 * 2335 * Must clear BIO_ERROR to prevent pages from being scrapped. 2336 */ 2337 bp->b_ioflags &= ~BIO_ERROR; 2338 bdirty(bp); 2339 } else if ((bp->b_flags & (B_NOCACHE | B_INVAL)) || 2340 (bp->b_ioflags & BIO_ERROR) || (bp->b_bufsize <= 0)) { 2341 /* 2342 * Either a failed read I/O, or we were asked to free or not 2343 * cache the buffer, or we failed to write to a device that's 2344 * no longer present. 2345 */ 2346 bp->b_flags |= B_INVAL; 2347 if (!LIST_EMPTY(&bp->b_dep)) 2348 buf_deallocate(bp); 2349 if (bp->b_flags & B_DELWRI) 2350 bdirtysub(); 2351 bp->b_flags &= ~(B_DELWRI | B_CACHE); 2352 if ((bp->b_flags & B_VMIO) == 0) { 2353 allocbuf(bp, 0); 2354 if (bp->b_vp) 2355 brelvp(bp); 2356 } 2357 } 2358 2359 /* 2360 * We must clear B_RELBUF if B_DELWRI is set. If vfs_vmio_truncate() 2361 * is called with B_DELWRI set, the underlying pages may wind up 2362 * getting freed, causing a previous write (bdwrite()) to get 'lost' 2363 * because pages associated with a B_DELWRI bp are marked clean. 2364 * 2365 * We still allow the B_INVAL case to call vfs_vmio_truncate(), even 2366 * if B_DELWRI is set. 2367 */ 2368 if (bp->b_flags & B_DELWRI) 2369 bp->b_flags &= ~B_RELBUF; 2370 2371 /* 2372 * VMIO buffer rundown. It is not very necessary to keep a VMIO buffer 2373 * constituted, not even NFS buffers now. Two flags affect this. If 2374 * B_INVAL, the struct buf is invalidated but the VM object is kept 2375 * around ( i.e. so it is trivial to reconstitute the buffer later ). 2376 * 2377 * If BIO_ERROR or B_NOCACHE is set, pages in the VM object will be 2378 * invalidated. BIO_ERROR cannot be set for a failed write unless the 2379 * buffer is also B_INVAL because it hits the re-dirtying code above. 2380 * 2381 * Normally we can do this whether a buffer is B_DELWRI or not. If 2382 * the buffer is an NFS buffer, it is tracking piecemeal writes or 2383 * the commit state and we cannot afford to lose the buffer. If the 2384 * buffer has a background write in progress, we need to keep it 2385 * around to prevent it from being reconstituted and starting a second 2386 * background write. 2387 */ 2388 if ((bp->b_flags & B_VMIO) && (bp->b_flags & B_NOCACHE || 2389 (bp->b_ioflags & BIO_ERROR && bp->b_iocmd == BIO_READ)) && 2390 !(bp->b_vp->v_mount != NULL && 2391 (bp->b_vp->v_mount->mnt_vfc->vfc_flags & VFCF_NETWORK) != 0 && 2392 !vn_isdisk(bp->b_vp, NULL) && (bp->b_flags & B_DELWRI))) { 2393 vfs_vmio_invalidate(bp); 2394 allocbuf(bp, 0); 2395 } 2396 2397 if ((bp->b_flags & (B_INVAL | B_RELBUF)) != 0 || 2398 (bp->b_flags & (B_DELWRI | B_NOREUSE)) == B_NOREUSE) { 2399 allocbuf(bp, 0); 2400 bp->b_flags &= ~B_NOREUSE; 2401 if (bp->b_vp != NULL) 2402 brelvp(bp); 2403 } 2404 2405 /* 2406 * If the buffer has junk contents, signal it and eventually 2407 * clean up B_DELWRI and disassociate the vnode so that gbincore() 2408 * doesn't find it.
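 *
 * To summarize the disposition logic below (a reading aid only, not a
 * change in behaviour):
 *
 *	b_bufsize == 0                                 -> buf_free()
 *	B_INVAL, B_NOCACHE, B_RELBUF or BIO_ERROR set  -> QUEUE_CLEAN, B_AGE
 *	B_DELWRI set                                   -> QUEUE_DIRTY
 *	otherwise                                      -> QUEUE_CLEAN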
2409 */ 2410 if (bp->b_bufsize == 0 || (bp->b_ioflags & BIO_ERROR) != 0 || 2411 (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF)) != 0) 2412 bp->b_flags |= B_INVAL; 2413 if (bp->b_flags & B_INVAL) { 2414 if (bp->b_flags & B_DELWRI) 2415 bundirty(bp); 2416 if (bp->b_vp) 2417 brelvp(bp); 2418 } 2419 2420 buf_track(bp, __func__); 2421 2422 /* buffers with no memory */ 2423 if (bp->b_bufsize == 0) { 2424 buf_free(bp); 2425 return; 2426 } 2427 /* buffers with junk contents */ 2428 if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) || 2429 (bp->b_ioflags & BIO_ERROR)) { 2430 bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA); 2431 if (bp->b_vflags & BV_BKGRDINPROG) 2432 panic("losing buffer 2"); 2433 qindex = QUEUE_CLEAN; 2434 bp->b_flags |= B_AGE; 2435 /* remaining buffers */ 2436 } else if (bp->b_flags & B_DELWRI) 2437 qindex = QUEUE_DIRTY; 2438 else 2439 qindex = QUEUE_CLEAN; 2440 2441 binsfree(bp, qindex); 2442 2443 bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF | B_DIRECT); 2444 if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY)) 2445 panic("brelse: not dirty"); 2446 /* unlock */ 2447 BUF_UNLOCK(bp); 2448 if (qindex == QUEUE_CLEAN) 2449 bufspace_wakeup(); 2450 } 2451 2452 /* 2453 * Release a buffer back to the appropriate queue but do not try to free 2454 * it. The buffer is expected to be used again soon. 2455 * 2456 * bqrelse() is used by bdwrite() to requeue a delayed write, and used by 2457 * biodone() to requeue an async I/O on completion. It is also used when 2458 * known good buffers need to be requeued but we think we may need the data 2459 * again soon. 2460 * 2461 * XXX we should be able to leave the B_RELBUF hint set on completion. 2462 */ 2463 void 2464 bqrelse(struct buf *bp) 2465 { 2466 int qindex; 2467 2468 CTR3(KTR_BUF, "bqrelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); 2469 KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), 2470 ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); 2471 2472 qindex = QUEUE_NONE; 2473 if (BUF_LOCKRECURSED(bp)) { 2474 /* do not release to free list */ 2475 BUF_UNLOCK(bp); 2476 return; 2477 } 2478 bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF); 2479 2480 if (bp->b_flags & B_MANAGED) { 2481 if (bp->b_flags & B_REMFREE) 2482 bremfreef(bp); 2483 goto out; 2484 } 2485 2486 /* buffers with stale but valid contents */ 2487 if ((bp->b_flags & B_DELWRI) != 0 || (bp->b_vflags & (BV_BKGRDINPROG | 2488 BV_BKGRDERR)) == BV_BKGRDERR) { 2489 BO_LOCK(bp->b_bufobj); 2490 bp->b_vflags &= ~BV_BKGRDERR; 2491 BO_UNLOCK(bp->b_bufobj); 2492 qindex = QUEUE_DIRTY; 2493 } else { 2494 if ((bp->b_flags & B_DELWRI) == 0 && 2495 (bp->b_xflags & BX_VNDIRTY)) 2496 panic("bqrelse: not dirty"); 2497 if ((bp->b_flags & B_NOREUSE) != 0) { 2498 brelse(bp); 2499 return; 2500 } 2501 qindex = QUEUE_CLEAN; 2502 } 2503 binsfree(bp, qindex); 2504 2505 out: 2506 buf_track(bp, __func__); 2507 /* unlock */ 2508 BUF_UNLOCK(bp); 2509 if (qindex == QUEUE_CLEAN) 2510 bufspace_wakeup(); 2511 } 2512 2513 /* 2514 * Complete I/O to a VMIO backed page. Validate the pages as appropriate, 2515 * restore bogus pages. 
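 *
 * ("Bogus" pages are the bogus_page placeholders that vfs_busy_pages()
 * may install in place of already fully valid pages before a read, so
 * that the transfer cannot clobber good data; the loop below swaps the
 * real pages back in and, if the buffer is mapped, re-enters them into
 * its KVA mapping.)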
2516 */ 2517 static void 2518 vfs_vmio_iodone(struct buf *bp) 2519 { 2520 vm_ooffset_t foff; 2521 vm_page_t m; 2522 vm_object_t obj; 2523 struct vnode *vp; 2524 int i, iosize, resid; 2525 bool bogus; 2526 2527 obj = bp->b_bufobj->bo_object; 2528 KASSERT(obj->paging_in_progress >= bp->b_npages, 2529 ("vfs_vmio_iodone: paging in progress(%d) < b_npages(%d)", 2530 obj->paging_in_progress, bp->b_npages)); 2531 2532 vp = bp->b_vp; 2533 KASSERT(vp->v_holdcnt > 0, 2534 ("vfs_vmio_iodone: vnode %p has zero hold count", vp)); 2535 KASSERT(vp->v_object != NULL, 2536 ("vfs_vmio_iodone: vnode %p has no vm_object", vp)); 2537 2538 foff = bp->b_offset; 2539 KASSERT(bp->b_offset != NOOFFSET, 2540 ("vfs_vmio_iodone: bp %p has no buffer offset", bp)); 2541 2542 bogus = false; 2543 iosize = bp->b_bcount - bp->b_resid; 2544 VM_OBJECT_WLOCK(obj); 2545 for (i = 0; i < bp->b_npages; i++) { 2546 resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff; 2547 if (resid > iosize) 2548 resid = iosize; 2549 2550 /* 2551 * cleanup bogus pages, restoring the originals 2552 */ 2553 m = bp->b_pages[i]; 2554 if (m == bogus_page) { 2555 bogus = true; 2556 m = vm_page_lookup(obj, OFF_TO_IDX(foff)); 2557 if (m == NULL) 2558 panic("biodone: page disappeared!"); 2559 bp->b_pages[i] = m; 2560 } else if ((bp->b_iocmd == BIO_READ) && resid > 0) { 2561 /* 2562 * In the write case, the valid and clean bits are 2563 * already changed correctly ( see bdwrite() ), so we 2564 * only need to do this here in the read case. 2565 */ 2566 KASSERT((m->dirty & vm_page_bits(foff & PAGE_MASK, 2567 resid)) == 0, ("vfs_vmio_iodone: page %p " 2568 "has unexpected dirty bits", m)); 2569 vfs_page_set_valid(bp, foff, m); 2570 } 2571 KASSERT(OFF_TO_IDX(foff) == m->pindex, 2572 ("vfs_vmio_iodone: foff(%jd)/pindex(%ju) mismatch", 2573 (intmax_t)foff, (uintmax_t)m->pindex)); 2574 2575 vm_page_sunbusy(m); 2576 foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; 2577 iosize -= resid; 2578 } 2579 vm_object_pip_wakeupn(obj, bp->b_npages); 2580 VM_OBJECT_WUNLOCK(obj); 2581 if (bogus && buf_mapped(bp)) { 2582 BUF_CHECK_MAPPED(bp); 2583 pmap_qenter(trunc_page((vm_offset_t)bp->b_data), 2584 bp->b_pages, bp->b_npages); 2585 } 2586 } 2587 2588 /* 2589 * Unwire a page held by a buf and place it on the appropriate vm queue. 2590 */ 2591 static void 2592 vfs_vmio_unwire(struct buf *bp, vm_page_t m) 2593 { 2594 bool freed; 2595 2596 vm_page_lock(m); 2597 if (vm_page_unwire(m, PQ_NONE)) { 2598 /* 2599 * Determine if the page should be freed before adding 2600 * it to the inactive queue. 2601 */ 2602 if (m->valid == 0) { 2603 freed = !vm_page_busied(m); 2604 if (freed) 2605 vm_page_free(m); 2606 } else if ((bp->b_flags & B_DIRECT) != 0) 2607 freed = vm_page_try_to_free(m); 2608 else 2609 freed = false; 2610 if (!freed) { 2611 /* 2612 * If the page is unlikely to be reused, let the 2613 * VM know. Otherwise, maintain LRU page 2614 * ordering and put the page at the tail of the 2615 * inactive queue. 2616 */ 2617 if ((bp->b_flags & B_NOREUSE) != 0) 2618 vm_page_deactivate_noreuse(m); 2619 else 2620 vm_page_deactivate(m); 2621 } 2622 } 2623 vm_page_unlock(m); 2624 } 2625 2626 /* 2627 * Perform page invalidation when a buffer is released. The fully invalid 2628 * pages will be reclaimed later in vfs_vmio_truncate(). 
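 *
 * Pages that are still wired into some pmap are not marked invalid
 * here; they only lose this buffer's wiring and are left to the VM
 * system.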
2629 */ 2630 static void 2631 vfs_vmio_invalidate(struct buf *bp) 2632 { 2633 vm_object_t obj; 2634 vm_page_t m; 2635 int i, resid, poffset, presid; 2636 2637 if (buf_mapped(bp)) { 2638 BUF_CHECK_MAPPED(bp); 2639 pmap_qremove(trunc_page((vm_offset_t)bp->b_data), bp->b_npages); 2640 } else 2641 BUF_CHECK_UNMAPPED(bp); 2642 /* 2643 * Get the base offset and length of the buffer. Note that 2644 * in the VMIO case if the buffer block size is not 2645 * page-aligned then b_data pointer may not be page-aligned. 2646 * But our b_pages[] array *IS* page aligned. 2647 * 2648 * block sizes less then DEV_BSIZE (usually 512) are not 2649 * supported due to the page granularity bits (m->valid, 2650 * m->dirty, etc...). 2651 * 2652 * See man buf(9) for more information 2653 */ 2654 obj = bp->b_bufobj->bo_object; 2655 resid = bp->b_bufsize; 2656 poffset = bp->b_offset & PAGE_MASK; 2657 VM_OBJECT_WLOCK(obj); 2658 for (i = 0; i < bp->b_npages; i++) { 2659 m = bp->b_pages[i]; 2660 if (m == bogus_page) 2661 panic("vfs_vmio_invalidate: Unexpected bogus page."); 2662 bp->b_pages[i] = NULL; 2663 2664 presid = resid > (PAGE_SIZE - poffset) ? 2665 (PAGE_SIZE - poffset) : resid; 2666 KASSERT(presid >= 0, ("brelse: extra page")); 2667 while (vm_page_xbusied(m)) { 2668 vm_page_lock(m); 2669 VM_OBJECT_WUNLOCK(obj); 2670 vm_page_busy_sleep(m, "mbncsh", true); 2671 VM_OBJECT_WLOCK(obj); 2672 } 2673 if (pmap_page_wired_mappings(m) == 0) 2674 vm_page_set_invalid(m, poffset, presid); 2675 vfs_vmio_unwire(bp, m); 2676 resid -= presid; 2677 poffset = 0; 2678 } 2679 VM_OBJECT_WUNLOCK(obj); 2680 bp->b_npages = 0; 2681 } 2682 2683 /* 2684 * Page-granular truncation of an existing VMIO buffer. 2685 */ 2686 static void 2687 vfs_vmio_truncate(struct buf *bp, int desiredpages) 2688 { 2689 vm_object_t obj; 2690 vm_page_t m; 2691 int i; 2692 2693 if (bp->b_npages == desiredpages) 2694 return; 2695 2696 if (buf_mapped(bp)) { 2697 BUF_CHECK_MAPPED(bp); 2698 pmap_qremove((vm_offset_t)trunc_page((vm_offset_t)bp->b_data) + 2699 (desiredpages << PAGE_SHIFT), bp->b_npages - desiredpages); 2700 } else 2701 BUF_CHECK_UNMAPPED(bp); 2702 obj = bp->b_bufobj->bo_object; 2703 if (obj != NULL) 2704 VM_OBJECT_WLOCK(obj); 2705 for (i = desiredpages; i < bp->b_npages; i++) { 2706 m = bp->b_pages[i]; 2707 KASSERT(m != bogus_page, ("allocbuf: bogus page found")); 2708 bp->b_pages[i] = NULL; 2709 vfs_vmio_unwire(bp, m); 2710 } 2711 if (obj != NULL) 2712 VM_OBJECT_WUNLOCK(obj); 2713 bp->b_npages = desiredpages; 2714 } 2715 2716 /* 2717 * Byte granular extension of VMIO buffers. 2718 */ 2719 static void 2720 vfs_vmio_extend(struct buf *bp, int desiredpages, int size) 2721 { 2722 /* 2723 * We are growing the buffer, possibly in a 2724 * byte-granular fashion. 2725 */ 2726 vm_object_t obj; 2727 vm_offset_t toff; 2728 vm_offset_t tinc; 2729 vm_page_t m; 2730 2731 /* 2732 * Step 1, bring in the VM pages from the object, allocating 2733 * them if necessary. We must clear B_CACHE if these pages 2734 * are not valid for the range covered by the buffer. 2735 */ 2736 obj = bp->b_bufobj->bo_object; 2737 VM_OBJECT_WLOCK(obj); 2738 while (bp->b_npages < desiredpages) { 2739 /* 2740 * We must allocate system pages since blocking 2741 * here could interfere with paging I/O, no 2742 * matter which process we are. 2743 * 2744 * Only exclusive busy can be tested here. 2745 * Blocking on shared busy might lead to 2746 * deadlocks once allocbuf() is called after 2747 * pages are vfs_busy_pages(). 
2748 */ 2749 m = vm_page_grab(obj, OFF_TO_IDX(bp->b_offset) + bp->b_npages, 2750 VM_ALLOC_NOBUSY | VM_ALLOC_SYSTEM | 2751 VM_ALLOC_WIRED | VM_ALLOC_IGN_SBUSY | 2752 VM_ALLOC_COUNT(desiredpages - bp->b_npages)); 2753 if (m->valid == 0) 2754 bp->b_flags &= ~B_CACHE; 2755 bp->b_pages[bp->b_npages] = m; 2756 ++bp->b_npages; 2757 } 2758 2759 /* 2760 * Step 2. We've loaded the pages into the buffer, 2761 * we have to figure out if we can still have B_CACHE 2762 * set. Note that B_CACHE is set according to the 2763 * byte-granular range ( bcount and size ), not the 2764 * aligned range ( newbsize ). 2765 * 2766 * The VM test is against m->valid, which is DEV_BSIZE 2767 * aligned. Needless to say, the validity of the data 2768 * needs to also be DEV_BSIZE aligned. Note that this 2769 * fails with NFS if the server or some other client 2770 * extends the file's EOF. If our buffer is resized, 2771 * B_CACHE may remain set! XXX 2772 */ 2773 toff = bp->b_bcount; 2774 tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK); 2775 while ((bp->b_flags & B_CACHE) && toff < size) { 2776 vm_pindex_t pi; 2777 2778 if (tinc > (size - toff)) 2779 tinc = size - toff; 2780 pi = ((bp->b_offset & PAGE_MASK) + toff) >> PAGE_SHIFT; 2781 m = bp->b_pages[pi]; 2782 vfs_buf_test_cache(bp, bp->b_offset, toff, tinc, m); 2783 toff += tinc; 2784 tinc = PAGE_SIZE; 2785 } 2786 VM_OBJECT_WUNLOCK(obj); 2787 2788 /* 2789 * Step 3, fixup the KVA pmap. 2790 */ 2791 if (buf_mapped(bp)) 2792 bpmap_qenter(bp); 2793 else 2794 BUF_CHECK_UNMAPPED(bp); 2795 } 2796 2797 /* 2798 * Check to see if a block at a particular lbn is available for a clustered 2799 * write. 2800 */ 2801 static int 2802 vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno) 2803 { 2804 struct buf *bpa; 2805 int match; 2806 2807 match = 0; 2808 2809 /* If the buf isn't in core skip it */ 2810 if ((bpa = gbincore(&vp->v_bufobj, lblkno)) == NULL) 2811 return (0); 2812 2813 /* If the buf is busy we don't want to wait for it */ 2814 if (BUF_LOCK(bpa, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) 2815 return (0); 2816 2817 /* Only cluster with valid clusterable delayed write buffers */ 2818 if ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) != 2819 (B_DELWRI | B_CLUSTEROK)) 2820 goto done; 2821 2822 if (bpa->b_bufsize != size) 2823 goto done; 2824 2825 /* 2826 * Check to see if it is in the expected place on disk and that the 2827 * block has been mapped. 2828 */ 2829 if ((bpa->b_blkno != bpa->b_lblkno) && (bpa->b_blkno == blkno)) 2830 match = 1; 2831 done: 2832 BUF_UNLOCK(bpa); 2833 return (match); 2834 } 2835 2836 /* 2837 * vfs_bio_awrite: 2838 * 2839 * Implement clustered async writes for clearing out B_DELWRI buffers. 2840 * This is much better then the old way of writing only one buffer at 2841 * a time. Note that we may not be presented with the buffers in the 2842 * correct order, so we search for the cluster in both directions. 2843 */ 2844 int 2845 vfs_bio_awrite(struct buf *bp) 2846 { 2847 struct bufobj *bo; 2848 int i; 2849 int j; 2850 daddr_t lblkno = bp->b_lblkno; 2851 struct vnode *vp = bp->b_vp; 2852 int ncl; 2853 int nwritten; 2854 int size; 2855 int maxcl; 2856 int gbflags; 2857 2858 bo = &vp->v_bufobj; 2859 gbflags = (bp->b_data == unmapped_buf) ? GB_UNMAPPED : 0; 2860 /* 2861 * right now we support clustered writing only to regular files. If 2862 * we find a clusterable block we could be in the middle of a cluster 2863 * rather then at the beginning. 
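 *
 * As a concrete illustration (assuming the traditional MAXPHYS of
 * 128kB): with a 16kB f_iosize, maxcl below is 8, so at most eight
 * contiguous clusterable delayed-write buffers are handed to
 * cluster_wbuild() as a single request.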
2864 */ 2865 if ((vp->v_type == VREG) && 2866 (vp->v_mount != 0) && /* Only on nodes that have the size info */ 2867 (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) { 2868 2869 size = vp->v_mount->mnt_stat.f_iosize; 2870 maxcl = MAXPHYS / size; 2871 2872 BO_RLOCK(bo); 2873 for (i = 1; i < maxcl; i++) 2874 if (vfs_bio_clcheck(vp, size, lblkno + i, 2875 bp->b_blkno + ((i * size) >> DEV_BSHIFT)) == 0) 2876 break; 2877 2878 for (j = 1; i + j <= maxcl && j <= lblkno; j++) 2879 if (vfs_bio_clcheck(vp, size, lblkno - j, 2880 bp->b_blkno - ((j * size) >> DEV_BSHIFT)) == 0) 2881 break; 2882 BO_RUNLOCK(bo); 2883 --j; 2884 ncl = i + j; 2885 /* 2886 * this is a possible cluster write 2887 */ 2888 if (ncl != 1) { 2889 BUF_UNLOCK(bp); 2890 nwritten = cluster_wbuild(vp, size, lblkno - j, ncl, 2891 gbflags); 2892 return (nwritten); 2893 } 2894 } 2895 bremfree(bp); 2896 bp->b_flags |= B_ASYNC; 2897 /* 2898 * default (old) behavior, writing out only one block 2899 * 2900 * XXX returns b_bufsize instead of b_bcount for nwritten? 2901 */ 2902 nwritten = bp->b_bufsize; 2903 (void) bwrite(bp); 2904 2905 return (nwritten); 2906 } 2907 2908 /* 2909 * getnewbuf_kva: 2910 * 2911 * Allocate KVA for an empty buf header according to gbflags. 2912 */ 2913 static int 2914 getnewbuf_kva(struct buf *bp, int gbflags, int maxsize) 2915 { 2916 2917 if ((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_UNMAPPED) { 2918 /* 2919 * In order to keep fragmentation sane we only allocate kva 2920 * in BKVASIZE chunks. XXX with vmem we can do page size. 2921 */ 2922 maxsize = (maxsize + BKVAMASK) & ~BKVAMASK; 2923 2924 if (maxsize != bp->b_kvasize && 2925 bufkva_alloc(bp, maxsize, gbflags)) 2926 return (ENOSPC); 2927 } 2928 return (0); 2929 } 2930 2931 /* 2932 * getnewbuf: 2933 * 2934 * Find and initialize a new buffer header, freeing up existing buffers 2935 * in the bufqueues as necessary. The new buffer is returned locked. 2936 * 2937 * We block if: 2938 * We have insufficient buffer headers 2939 * We have insufficient buffer space 2940 * buffer_arena is too fragmented ( space reservation fails ) 2941 * If we have to flush dirty buffers ( but we try to avoid this ) 2942 * 2943 * The caller is responsible for releasing the reserved bufspace after 2944 * allocbuf() is called. 2945 */ 2946 static struct buf * 2947 getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int maxsize, int gbflags) 2948 { 2949 struct buf *bp; 2950 bool metadata, reserved; 2951 2952 bp = NULL; 2953 KASSERT((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC, 2954 ("GB_KVAALLOC only makes sense with GB_UNMAPPED")); 2955 if (!unmapped_buf_allowed) 2956 gbflags &= ~(GB_UNMAPPED | GB_KVAALLOC); 2957 2958 if (vp == NULL || (vp->v_vflag & (VV_MD | VV_SYSTEM)) != 0 || 2959 vp->v_type == VCHR) 2960 metadata = true; 2961 else 2962 metadata = false; 2963 atomic_add_int(&getnewbufcalls, 1); 2964 reserved = false; 2965 do { 2966 if (reserved == false && 2967 bufspace_reserve(maxsize, metadata) != 0) 2968 continue; 2969 reserved = true; 2970 if ((bp = buf_alloc()) == NULL) 2971 continue; 2972 if (getnewbuf_kva(bp, gbflags, maxsize) == 0) 2973 return (bp); 2974 break; 2975 } while(buf_scan(false) == 0); 2976 2977 if (reserved) 2978 atomic_subtract_long(&bufspace, maxsize); 2979 if (bp != NULL) { 2980 bp->b_flags |= B_INVAL; 2981 brelse(bp); 2982 } 2983 bufspace_wait(vp, gbflags, slpflag, slptimeo); 2984 2985 return (NULL); 2986 } 2987 2988 /* 2989 * buf_daemon: 2990 * 2991 * buffer flushing daemon. 
Buffers are normally flushed by the 2992 * update daemon but if it cannot keep up this process starts to 2993 * take the load in an attempt to prevent getnewbuf() from blocking. 2994 */ 2995 static struct kproc_desc buf_kp = { 2996 "bufdaemon", 2997 buf_daemon, 2998 &bufdaemonproc 2999 }; 3000 SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp); 3001 3002 static int 3003 buf_flush(struct vnode *vp, int target) 3004 { 3005 int flushed; 3006 3007 flushed = flushbufqueues(vp, target, 0); 3008 if (flushed == 0) { 3009 /* 3010 * Could not find any buffers without rollback 3011 * dependencies, so just write the first one 3012 * in the hopes of eventually making progress. 3013 */ 3014 if (vp != NULL && target > 2) 3015 target /= 2; 3016 flushbufqueues(vp, target, 1); 3017 } 3018 return (flushed); 3019 } 3020 3021 static void 3022 buf_daemon() 3023 { 3024 int lodirty; 3025 3026 /* 3027 * This process needs to be suspended prior to shutdown sync. 3028 */ 3029 EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, bufdaemonproc, 3030 SHUTDOWN_PRI_LAST); 3031 3032 /* 3033 * This process is allowed to take the buffer cache to the limit 3034 */ 3035 curthread->td_pflags |= TDP_NORUNNINGBUF | TDP_BUFNEED; 3036 mtx_lock(&bdlock); 3037 for (;;) { 3038 bd_request = 0; 3039 mtx_unlock(&bdlock); 3040 3041 kproc_suspend_check(bufdaemonproc); 3042 lodirty = lodirtybuffers; 3043 if (bd_speedupreq) { 3044 lodirty = numdirtybuffers / 2; 3045 bd_speedupreq = 0; 3046 } 3047 /* 3048 * Do the flush. Limit the amount of in-transit I/O we 3049 * allow to build up, otherwise we would completely saturate 3050 * the I/O system. 3051 */ 3052 while (numdirtybuffers > lodirty) { 3053 if (buf_flush(NULL, numdirtybuffers - lodirty) == 0) 3054 break; 3055 kern_yield(PRI_USER); 3056 } 3057 3058 /* 3059 * Only clear bd_request if we have reached our low water 3060 * mark. The buf_daemon normally waits 1 second and 3061 * then incrementally flushes any dirty buffers that have 3062 * built up, within reason. 3063 * 3064 * If we were unable to hit our low water mark and couldn't 3065 * find any flushable buffers, we sleep for a short period 3066 * to avoid endless loops on unlockable buffers. 3067 */ 3068 mtx_lock(&bdlock); 3069 if (numdirtybuffers <= lodirtybuffers) { 3070 /* 3071 * We reached our low water mark, reset the 3072 * request and sleep until we are needed again. 3073 * The sleep is just so the suspend code works. 3074 */ 3075 bd_request = 0; 3076 /* 3077 * Do an extra wakeup in case dirty threshold 3078 * changed via sysctl and the explicit transition 3079 * out of shortfall was missed. 3080 */ 3081 bdirtywakeup(); 3082 if (runningbufspace <= lorunningspace) 3083 runningwakeup(); 3084 msleep(&bd_request, &bdlock, PVM, "psleep", hz); 3085 } else { 3086 /* 3087 * We couldn't find any flushable dirty buffers but 3088 * still have too many dirty buffers, we 3089 * have to sleep and try again. (rare) 3090 */ 3091 msleep(&bd_request, &bdlock, PVM, "qsleep", hz / 10); 3092 } 3093 } 3094 } 3095 3096 /* 3097 * flushbufqueues: 3098 * 3099 * Try to flush a buffer in the dirty queue. We must be careful to 3100 * free up B_INVAL buffers instead of write them, which NFS is 3101 * particularly sensitive to. 
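 *
 * The scan below keeps its position in the queue with a dummy sentinel
 * buf (b_qindex == QUEUE_SENTINEL), which lets the queue lock be
 * dropped while each candidate buffer is locked, examined and possibly
 * written.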
3102 */ 3103 static int flushwithdeps = 0; 3104 SYSCTL_INT(_vfs, OID_AUTO, flushwithdeps, CTLFLAG_RW, &flushwithdeps, 3105 0, "Number of buffers flushed with dependecies that require rollbacks"); 3106 3107 static int 3108 flushbufqueues(struct vnode *lvp, int target, int flushdeps) 3109 { 3110 struct buf *sentinel; 3111 struct vnode *vp; 3112 struct mount *mp; 3113 struct buf *bp; 3114 int hasdeps; 3115 int flushed; 3116 int queue; 3117 int error; 3118 bool unlock; 3119 3120 flushed = 0; 3121 queue = QUEUE_DIRTY; 3122 bp = NULL; 3123 sentinel = malloc(sizeof(struct buf), M_TEMP, M_WAITOK | M_ZERO); 3124 sentinel->b_qindex = QUEUE_SENTINEL; 3125 mtx_lock(&bqlocks[queue]); 3126 TAILQ_INSERT_HEAD(&bufqueues[queue], sentinel, b_freelist); 3127 mtx_unlock(&bqlocks[queue]); 3128 while (flushed != target) { 3129 maybe_yield(); 3130 mtx_lock(&bqlocks[queue]); 3131 bp = TAILQ_NEXT(sentinel, b_freelist); 3132 if (bp != NULL) { 3133 TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist); 3134 TAILQ_INSERT_AFTER(&bufqueues[queue], bp, sentinel, 3135 b_freelist); 3136 } else { 3137 mtx_unlock(&bqlocks[queue]); 3138 break; 3139 } 3140 /* 3141 * Skip sentinels inserted by other invocations of the 3142 * flushbufqueues(), taking care to not reorder them. 3143 * 3144 * Only flush the buffers that belong to the 3145 * vnode locked by the curthread. 3146 */ 3147 if (bp->b_qindex == QUEUE_SENTINEL || (lvp != NULL && 3148 bp->b_vp != lvp)) { 3149 mtx_unlock(&bqlocks[queue]); 3150 continue; 3151 } 3152 error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL); 3153 mtx_unlock(&bqlocks[queue]); 3154 if (error != 0) 3155 continue; 3156 3157 /* 3158 * BKGRDINPROG can only be set with the buf and bufobj 3159 * locks both held. We tolerate a race to clear it here. 3160 */ 3161 if ((bp->b_vflags & BV_BKGRDINPROG) != 0 || 3162 (bp->b_flags & B_DELWRI) == 0) { 3163 BUF_UNLOCK(bp); 3164 continue; 3165 } 3166 if (bp->b_flags & B_INVAL) { 3167 bremfreef(bp); 3168 brelse(bp); 3169 flushed++; 3170 continue; 3171 } 3172 3173 if (!LIST_EMPTY(&bp->b_dep) && buf_countdeps(bp, 0)) { 3174 if (flushdeps == 0) { 3175 BUF_UNLOCK(bp); 3176 continue; 3177 } 3178 hasdeps = 1; 3179 } else 3180 hasdeps = 0; 3181 /* 3182 * We must hold the lock on a vnode before writing 3183 * one of its buffers. Otherwise we may confuse, or 3184 * in the case of a snapshot vnode, deadlock the 3185 * system. 3186 * 3187 * The lock order here is the reverse of the normal 3188 * of vnode followed by buf lock. This is ok because 3189 * the NOWAIT will prevent deadlock. 3190 */ 3191 vp = bp->b_vp; 3192 if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { 3193 BUF_UNLOCK(bp); 3194 continue; 3195 } 3196 if (lvp == NULL) { 3197 unlock = true; 3198 error = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT); 3199 } else { 3200 ASSERT_VOP_LOCKED(vp, "getbuf"); 3201 unlock = false; 3202 error = VOP_ISLOCKED(vp) == LK_EXCLUSIVE ? 0 : 3203 vn_lock(vp, LK_TRYUPGRADE); 3204 } 3205 if (error == 0) { 3206 CTR3(KTR_BUF, "flushbufqueue(%p) vp %p flags %X", 3207 bp, bp->b_vp, bp->b_flags); 3208 if (curproc == bufdaemonproc) { 3209 vfs_bio_awrite(bp); 3210 } else { 3211 bremfree(bp); 3212 bwrite(bp); 3213 notbufdflushes++; 3214 } 3215 vn_finished_write(mp); 3216 if (unlock) 3217 VOP_UNLOCK(vp, 0); 3218 flushwithdeps += hasdeps; 3219 flushed++; 3220 3221 /* 3222 * Sleeping on runningbufspace while holding 3223 * vnode lock leads to deadlock. 
3224 */ 3225 if (curproc == bufdaemonproc && 3226 runningbufspace > hirunningspace) 3227 waitrunningbufspace(); 3228 continue; 3229 } 3230 vn_finished_write(mp); 3231 BUF_UNLOCK(bp); 3232 } 3233 mtx_lock(&bqlocks[queue]); 3234 TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist); 3235 mtx_unlock(&bqlocks[queue]); 3236 free(sentinel, M_TEMP); 3237 return (flushed); 3238 } 3239 3240 /* 3241 * Check to see if a block is currently memory resident. 3242 */ 3243 struct buf * 3244 incore(struct bufobj *bo, daddr_t blkno) 3245 { 3246 struct buf *bp; 3247 3248 BO_RLOCK(bo); 3249 bp = gbincore(bo, blkno); 3250 BO_RUNLOCK(bo); 3251 return (bp); 3252 } 3253 3254 /* 3255 * Returns true if no I/O is needed to access the 3256 * associated VM object. This is like incore except 3257 * it also hunts around in the VM system for the data. 3258 */ 3259 3260 static int 3261 inmem(struct vnode * vp, daddr_t blkno) 3262 { 3263 vm_object_t obj; 3264 vm_offset_t toff, tinc, size; 3265 vm_page_t m; 3266 vm_ooffset_t off; 3267 3268 ASSERT_VOP_LOCKED(vp, "inmem"); 3269 3270 if (incore(&vp->v_bufobj, blkno)) 3271 return 1; 3272 if (vp->v_mount == NULL) 3273 return 0; 3274 obj = vp->v_object; 3275 if (obj == NULL) 3276 return (0); 3277 3278 size = PAGE_SIZE; 3279 if (size > vp->v_mount->mnt_stat.f_iosize) 3280 size = vp->v_mount->mnt_stat.f_iosize; 3281 off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize; 3282 3283 VM_OBJECT_RLOCK(obj); 3284 for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) { 3285 m = vm_page_lookup(obj, OFF_TO_IDX(off + toff)); 3286 if (!m) 3287 goto notinmem; 3288 tinc = size; 3289 if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK)) 3290 tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK); 3291 if (vm_page_is_valid(m, 3292 (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0) 3293 goto notinmem; 3294 } 3295 VM_OBJECT_RUNLOCK(obj); 3296 return 1; 3297 3298 notinmem: 3299 VM_OBJECT_RUNLOCK(obj); 3300 return (0); 3301 } 3302 3303 /* 3304 * Set the dirty range for a buffer based on the status of the dirty 3305 * bits in the pages comprising the buffer. The range is limited 3306 * to the size of the buffer. 3307 * 3308 * Tell the VM system that the pages associated with this buffer 3309 * are clean. This is used for delayed writes where the data is 3310 * going to go to disk eventually without additional VM intevention. 3311 * 3312 * Note that while we only really need to clean through to b_bcount, we 3313 * just go ahead and clean through to b_bufsize. 
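 *
 * Marking the pages clean here is safe because the buffer itself stays
 * B_DELWRI: from this point the dirtiness is tracked by the buffer, and
 * the data still reaches the disk on the next sync or when the cluster
 * is completed.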
3314 */ 3315 static void 3316 vfs_clean_pages_dirty_buf(struct buf *bp) 3317 { 3318 vm_ooffset_t foff, noff, eoff; 3319 vm_page_t m; 3320 int i; 3321 3322 if ((bp->b_flags & B_VMIO) == 0 || bp->b_bufsize == 0) 3323 return; 3324 3325 foff = bp->b_offset; 3326 KASSERT(bp->b_offset != NOOFFSET, 3327 ("vfs_clean_pages_dirty_buf: no buffer offset")); 3328 3329 VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); 3330 vfs_drain_busy_pages(bp); 3331 vfs_setdirty_locked_object(bp); 3332 for (i = 0; i < bp->b_npages; i++) { 3333 noff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; 3334 eoff = noff; 3335 if (eoff > bp->b_offset + bp->b_bufsize) 3336 eoff = bp->b_offset + bp->b_bufsize; 3337 m = bp->b_pages[i]; 3338 vfs_page_set_validclean(bp, foff, m); 3339 /* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */ 3340 foff = noff; 3341 } 3342 VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); 3343 } 3344 3345 static void 3346 vfs_setdirty_locked_object(struct buf *bp) 3347 { 3348 vm_object_t object; 3349 int i; 3350 3351 object = bp->b_bufobj->bo_object; 3352 VM_OBJECT_ASSERT_WLOCKED(object); 3353 3354 /* 3355 * We qualify the scan for modified pages on whether the 3356 * object has been flushed yet. 3357 */ 3358 if ((object->flags & OBJ_MIGHTBEDIRTY) != 0) { 3359 vm_offset_t boffset; 3360 vm_offset_t eoffset; 3361 3362 /* 3363 * test the pages to see if they have been modified directly 3364 * by users through the VM system. 3365 */ 3366 for (i = 0; i < bp->b_npages; i++) 3367 vm_page_test_dirty(bp->b_pages[i]); 3368 3369 /* 3370 * Calculate the encompassing dirty range, boffset and eoffset, 3371 * (eoffset - boffset) bytes. 3372 */ 3373 3374 for (i = 0; i < bp->b_npages; i++) { 3375 if (bp->b_pages[i]->dirty) 3376 break; 3377 } 3378 boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); 3379 3380 for (i = bp->b_npages - 1; i >= 0; --i) { 3381 if (bp->b_pages[i]->dirty) { 3382 break; 3383 } 3384 } 3385 eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); 3386 3387 /* 3388 * Fit it to the buffer. 3389 */ 3390 3391 if (eoffset > bp->b_bcount) 3392 eoffset = bp->b_bcount; 3393 3394 /* 3395 * If we have a good dirty range, merge with the existing 3396 * dirty range. 3397 */ 3398 3399 if (boffset < eoffset) { 3400 if (bp->b_dirtyoff > boffset) 3401 bp->b_dirtyoff = boffset; 3402 if (bp->b_dirtyend < eoffset) 3403 bp->b_dirtyend = eoffset; 3404 } 3405 } 3406 } 3407 3408 /* 3409 * Allocate the KVA mapping for an existing buffer. 3410 * If an unmapped buffer is provided but a mapped buffer is requested, take 3411 * also care to properly setup mappings between pages and KVA. 3412 */ 3413 static void 3414 bp_unmapped_get_kva(struct buf *bp, daddr_t blkno, int size, int gbflags) 3415 { 3416 int bsize, maxsize, need_mapping, need_kva; 3417 off_t offset; 3418 3419 need_mapping = bp->b_data == unmapped_buf && 3420 (gbflags & GB_UNMAPPED) == 0; 3421 need_kva = bp->b_kvabase == unmapped_buf && 3422 bp->b_data == unmapped_buf && 3423 (gbflags & GB_KVAALLOC) != 0; 3424 if (!need_mapping && !need_kva) 3425 return; 3426 3427 BUF_CHECK_UNMAPPED(bp); 3428 3429 if (need_mapping && bp->b_kvabase != unmapped_buf) { 3430 /* 3431 * Buffer is not mapped, but the KVA was already 3432 * reserved at the time of the instantiation. Use the 3433 * allocated space. 3434 */ 3435 goto has_addr; 3436 } 3437 3438 /* 3439 * Calculate the amount of the address space we would reserve 3440 * if the buffer was mapped. 3441 */ 3442 bsize = vn_isdisk(bp->b_vp, NULL) ? 
DEV_BSIZE : bp->b_bufobj->bo_bsize; 3443 KASSERT(bsize != 0, ("bsize == 0, check bo->bo_bsize")); 3444 offset = blkno * bsize; 3445 maxsize = size + (offset & PAGE_MASK); 3446 maxsize = imax(maxsize, bsize); 3447 3448 while (bufkva_alloc(bp, maxsize, gbflags) != 0) { 3449 if ((gbflags & GB_NOWAIT_BD) != 0) { 3450 /* 3451 * XXXKIB: defragmentation cannot 3452 * succeed, not sure what else to do. 3453 */ 3454 panic("GB_NOWAIT_BD and GB_UNMAPPED %p", bp); 3455 } 3456 atomic_add_int(&mappingrestarts, 1); 3457 bufspace_wait(bp->b_vp, gbflags, 0, 0); 3458 } 3459 has_addr: 3460 if (need_mapping) { 3461 /* b_offset is handled by bpmap_qenter. */ 3462 bp->b_data = bp->b_kvabase; 3463 BUF_CHECK_MAPPED(bp); 3464 bpmap_qenter(bp); 3465 } 3466 } 3467 3468 /* 3469 * getblk: 3470 * 3471 * Get a block given a specified block and offset into a file/device. 3472 * The buffers B_DONE bit will be cleared on return, making it almost 3473 * ready for an I/O initiation. B_INVAL may or may not be set on 3474 * return. The caller should clear B_INVAL prior to initiating a 3475 * READ. 3476 * 3477 * For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for 3478 * an existing buffer. 3479 * 3480 * For a VMIO buffer, B_CACHE is modified according to the backing VM. 3481 * If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set 3482 * and then cleared based on the backing VM. If the previous buffer is 3483 * non-0-sized but invalid, B_CACHE will be cleared. 3484 * 3485 * If getblk() must create a new buffer, the new buffer is returned with 3486 * both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which 3487 * case it is returned with B_INVAL clear and B_CACHE set based on the 3488 * backing VM. 3489 * 3490 * getblk() also forces a bwrite() for any B_DELWRI buffer whos 3491 * B_CACHE bit is clear. 3492 * 3493 * What this means, basically, is that the caller should use B_CACHE to 3494 * determine whether the buffer is fully valid or not and should clear 3495 * B_INVAL prior to issuing a read. If the caller intends to validate 3496 * the buffer by loading its data area with something, the caller needs 3497 * to clear B_INVAL. If the caller does this without issuing an I/O, 3498 * the caller should set B_CACHE ( as an optimization ), else the caller 3499 * should issue the I/O and biodone() will set B_CACHE if the I/O was 3500 * a write attempt or if it was a successful read. If the caller 3501 * intends to issue a READ, the caller must clear B_INVAL and BIO_ERROR 3502 * prior to issuing the READ. biodone() will *not* clear B_INVAL. 3503 */ 3504 struct buf * 3505 getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo, 3506 int flags) 3507 { 3508 struct buf *bp; 3509 struct bufobj *bo; 3510 int bsize, error, maxsize, vmio; 3511 off_t offset; 3512 3513 CTR3(KTR_BUF, "getblk(%p, %ld, %d)", vp, (long)blkno, size); 3514 KASSERT((flags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC, 3515 ("GB_KVAALLOC only makes sense with GB_UNMAPPED")); 3516 ASSERT_VOP_LOCKED(vp, "getblk"); 3517 if (size > maxbcachebuf) 3518 panic("getblk: size(%d) > maxbcachebuf(%d)\n", size, 3519 maxbcachebuf); 3520 if (!unmapped_buf_allowed) 3521 flags &= ~(GB_UNMAPPED | GB_KVAALLOC); 3522 3523 bo = &vp->v_bufobj; 3524 loop: 3525 BO_RLOCK(bo); 3526 bp = gbincore(bo, blkno); 3527 if (bp != NULL) { 3528 int lockflags; 3529 /* 3530 * Buffer is in-core. If the buffer is not busy nor managed, 3531 * it must be on a queue. 
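 *
 * (For context, a typical write path that ends up here is, as an
 * illustrative sketch only and not code from this file:
 *
 *	bp = getblk(vp, lbn, bsize, 0, 0, 0);
 *	... copy the new data into bp->b_data ...
 *	bwrite(bp);		(or bdwrite(bp) for a delayed write)
 *
 * where "lbn" and "bsize" are placeholders.)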
3532 */ 3533 lockflags = LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK; 3534 3535 if (flags & GB_LOCK_NOWAIT) 3536 lockflags |= LK_NOWAIT; 3537 3538 error = BUF_TIMELOCK(bp, lockflags, 3539 BO_LOCKPTR(bo), "getblk", slpflag, slptimeo); 3540 3541 /* 3542 * If we slept and got the lock we have to restart in case 3543 * the buffer changed identities. 3544 */ 3545 if (error == ENOLCK) 3546 goto loop; 3547 /* We timed out or were interrupted. */ 3548 else if (error) 3549 return (NULL); 3550 /* If recursed, assume caller knows the rules. */ 3551 else if (BUF_LOCKRECURSED(bp)) 3552 goto end; 3553 3554 /* 3555 * The buffer is locked. B_CACHE is cleared if the buffer is 3556 * invalid. Otherwise, for a non-VMIO buffer, B_CACHE is set 3557 * and for a VMIO buffer B_CACHE is adjusted according to the 3558 * backing VM cache. 3559 */ 3560 if (bp->b_flags & B_INVAL) 3561 bp->b_flags &= ~B_CACHE; 3562 else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0) 3563 bp->b_flags |= B_CACHE; 3564 if (bp->b_flags & B_MANAGED) 3565 MPASS(bp->b_qindex == QUEUE_NONE); 3566 else 3567 bremfree(bp); 3568 3569 /* 3570 * check for size inconsistencies for non-VMIO case. 3571 */ 3572 if (bp->b_bcount != size) { 3573 if ((bp->b_flags & B_VMIO) == 0 || 3574 (size > bp->b_kvasize)) { 3575 if (bp->b_flags & B_DELWRI) { 3576 bp->b_flags |= B_NOCACHE; 3577 bwrite(bp); 3578 } else { 3579 if (LIST_EMPTY(&bp->b_dep)) { 3580 bp->b_flags |= B_RELBUF; 3581 brelse(bp); 3582 } else { 3583 bp->b_flags |= B_NOCACHE; 3584 bwrite(bp); 3585 } 3586 } 3587 goto loop; 3588 } 3589 } 3590 3591 /* 3592 * Handle the case of unmapped buffer which should 3593 * become mapped, or the buffer for which KVA 3594 * reservation is requested. 3595 */ 3596 bp_unmapped_get_kva(bp, blkno, size, flags); 3597 3598 /* 3599 * If the size is inconsistent in the VMIO case, we can resize 3600 * the buffer. This might lead to B_CACHE getting set or 3601 * cleared. If the size has not changed, B_CACHE remains 3602 * unchanged from its previous state. 3603 */ 3604 allocbuf(bp, size); 3605 3606 KASSERT(bp->b_offset != NOOFFSET, 3607 ("getblk: no buffer offset")); 3608 3609 /* 3610 * A buffer with B_DELWRI set and B_CACHE clear must 3611 * be committed before we can return the buffer in 3612 * order to prevent the caller from issuing a read 3613 * ( due to B_CACHE not being set ) and overwriting 3614 * it. 3615 * 3616 * Most callers, including NFS and FFS, need this to 3617 * operate properly either because they assume they 3618 * can issue a read if B_CACHE is not set, or because 3619 * ( for example ) an uncached B_DELWRI might loop due 3620 * to softupdates re-dirtying the buffer. In the latter 3621 * case, B_CACHE is set after the first write completes, 3622 * preventing further loops. 3623 * NOTE! b*write() sets B_CACHE. If we cleared B_CACHE 3624 * above while extending the buffer, we cannot allow the 3625 * buffer to remain with B_CACHE set after the write 3626 * completes or it will represent a corrupt state. To 3627 * deal with this we set B_NOCACHE to scrap the buffer 3628 * after the write. 3629 * 3630 * We might be able to do something fancy, like setting 3631 * B_CACHE in bwrite() except if B_DELWRI is already set, 3632 * so the below call doesn't set B_CACHE, but that gets real 3633 * confusing. This is much easier. 3634 */ 3635 3636 if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) { 3637 bp->b_flags |= B_NOCACHE; 3638 bwrite(bp); 3639 goto loop; 3640 } 3641 bp->b_flags &= ~B_DONE; 3642 } else { 3643 /* 3644 * Buffer is not in-core, create new buffer. 
The buffer 3645 * returned by getnewbuf() is locked. Note that the returned 3646 * buffer is also considered valid (not marked B_INVAL). 3647 */ 3648 BO_RUNLOCK(bo); 3649 /* 3650 * If the user does not want us to create the buffer, bail out 3651 * here. 3652 */ 3653 if (flags & GB_NOCREAT) 3654 return NULL; 3655 if (numfreebuffers == 0 && TD_IS_IDLETHREAD(curthread)) 3656 return NULL; 3657 3658 bsize = vn_isdisk(vp, NULL) ? DEV_BSIZE : bo->bo_bsize; 3659 KASSERT(bsize != 0, ("bsize == 0, check bo->bo_bsize")); 3660 offset = blkno * bsize; 3661 vmio = vp->v_object != NULL; 3662 if (vmio) { 3663 maxsize = size + (offset & PAGE_MASK); 3664 } else { 3665 maxsize = size; 3666 /* Do not allow non-VMIO notmapped buffers. */ 3667 flags &= ~(GB_UNMAPPED | GB_KVAALLOC); 3668 } 3669 maxsize = imax(maxsize, bsize); 3670 3671 bp = getnewbuf(vp, slpflag, slptimeo, maxsize, flags); 3672 if (bp == NULL) { 3673 if (slpflag || slptimeo) 3674 return NULL; 3675 /* 3676 * XXX This is here until the sleep path is diagnosed 3677 * enough to work under very low memory conditions. 3678 * 3679 * There's an issue on low memory, 4BSD+non-preempt 3680 * systems (eg MIPS routers with 32MB RAM) where buffer 3681 * exhaustion occurs without sleeping for buffer 3682 * reclaimation. This just sticks in a loop and 3683 * constantly attempts to allocate a buffer, which 3684 * hits exhaustion and tries to wakeup bufdaemon. 3685 * This never happens because we never yield. 3686 * 3687 * The real solution is to identify and fix these cases 3688 * so we aren't effectively busy-waiting in a loop 3689 * until the reclaimation path has cycles to run. 3690 */ 3691 kern_yield(PRI_USER); 3692 goto loop; 3693 } 3694 3695 /* 3696 * This code is used to make sure that a buffer is not 3697 * created while the getnewbuf routine is blocked. 3698 * This can be a problem whether the vnode is locked or not. 3699 * If the buffer is created out from under us, we have to 3700 * throw away the one we just created. 3701 * 3702 * Note: this must occur before we associate the buffer 3703 * with the vp especially considering limitations in 3704 * the splay tree implementation when dealing with duplicate 3705 * lblkno's. 3706 */ 3707 BO_LOCK(bo); 3708 if (gbincore(bo, blkno)) { 3709 BO_UNLOCK(bo); 3710 bp->b_flags |= B_INVAL; 3711 brelse(bp); 3712 bufspace_release(maxsize); 3713 goto loop; 3714 } 3715 3716 /* 3717 * Insert the buffer into the hash, so that it can 3718 * be found by incore. 3719 */ 3720 bp->b_blkno = bp->b_lblkno = blkno; 3721 bp->b_offset = offset; 3722 bgetvp(vp, bp); 3723 BO_UNLOCK(bo); 3724 3725 /* 3726 * set B_VMIO bit. allocbuf() the buffer bigger. Since the 3727 * buffer size starts out as 0, B_CACHE will be set by 3728 * allocbuf() for the VMIO case prior to it testing the 3729 * backing store for validity. 3730 */ 3731 3732 if (vmio) { 3733 bp->b_flags |= B_VMIO; 3734 KASSERT(vp->v_object == bp->b_bufobj->bo_object, 3735 ("ARGH! different b_bufobj->bo_object %p %p %p\n", 3736 bp, vp->v_object, bp->b_bufobj->bo_object)); 3737 } else { 3738 bp->b_flags &= ~B_VMIO; 3739 KASSERT(bp->b_bufobj->bo_object == NULL, 3740 ("ARGH! 
has b_bufobj->bo_object %p %p\n", 3741 bp, bp->b_bufobj->bo_object)); 3742 BUF_CHECK_MAPPED(bp); 3743 } 3744 3745 allocbuf(bp, size); 3746 bufspace_release(maxsize); 3747 bp->b_flags &= ~B_DONE; 3748 } 3749 CTR4(KTR_BUF, "getblk(%p, %ld, %d) = %p", vp, (long)blkno, size, bp); 3750 BUF_ASSERT_HELD(bp); 3751 end: 3752 buf_track(bp, __func__); 3753 KASSERT(bp->b_bufobj == bo, 3754 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 3755 return (bp); 3756 } 3757 3758 /* 3759 * Get an empty, disassociated buffer of given size. The buffer is initially 3760 * set to B_INVAL. 3761 */ 3762 struct buf * 3763 geteblk(int size, int flags) 3764 { 3765 struct buf *bp; 3766 int maxsize; 3767 3768 maxsize = (size + BKVAMASK) & ~BKVAMASK; 3769 while ((bp = getnewbuf(NULL, 0, 0, maxsize, flags)) == NULL) { 3770 if ((flags & GB_NOWAIT_BD) && 3771 (curthread->td_pflags & TDP_BUFNEED) != 0) 3772 return (NULL); 3773 } 3774 allocbuf(bp, size); 3775 bufspace_release(maxsize); 3776 bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */ 3777 BUF_ASSERT_HELD(bp); 3778 return (bp); 3779 } 3780 3781 /* 3782 * Truncate the backing store for a non-vmio buffer. 3783 */ 3784 static void 3785 vfs_nonvmio_truncate(struct buf *bp, int newbsize) 3786 { 3787 3788 if (bp->b_flags & B_MALLOC) { 3789 /* 3790 * malloced buffers are not shrunk 3791 */ 3792 if (newbsize == 0) { 3793 bufmallocadjust(bp, 0); 3794 free(bp->b_data, M_BIOBUF); 3795 bp->b_data = bp->b_kvabase; 3796 bp->b_flags &= ~B_MALLOC; 3797 } 3798 return; 3799 } 3800 vm_hold_free_pages(bp, newbsize); 3801 bufspace_adjust(bp, newbsize); 3802 } 3803 3804 /* 3805 * Extend the backing for a non-VMIO buffer. 3806 */ 3807 static void 3808 vfs_nonvmio_extend(struct buf *bp, int newbsize) 3809 { 3810 caddr_t origbuf; 3811 int origbufsize; 3812 3813 /* 3814 * We only use malloced memory on the first allocation. 3815 * and revert to page-allocated memory when the buffer 3816 * grows. 3817 * 3818 * There is a potential smp race here that could lead 3819 * to bufmallocspace slightly passing the max. It 3820 * is probably extremely rare and not worth worrying 3821 * over. 3822 */ 3823 if (bp->b_bufsize == 0 && newbsize <= PAGE_SIZE/2 && 3824 bufmallocspace < maxbufmallocspace) { 3825 bp->b_data = malloc(newbsize, M_BIOBUF, M_WAITOK); 3826 bp->b_flags |= B_MALLOC; 3827 bufmallocadjust(bp, newbsize); 3828 return; 3829 } 3830 3831 /* 3832 * If the buffer is growing on its other-than-first 3833 * allocation then we revert to the page-allocation 3834 * scheme. 3835 */ 3836 origbuf = NULL; 3837 origbufsize = 0; 3838 if (bp->b_flags & B_MALLOC) { 3839 origbuf = bp->b_data; 3840 origbufsize = bp->b_bufsize; 3841 bp->b_data = bp->b_kvabase; 3842 bufmallocadjust(bp, 0); 3843 bp->b_flags &= ~B_MALLOC; 3844 newbsize = round_page(newbsize); 3845 } 3846 vm_hold_load_pages(bp, (vm_offset_t) bp->b_data + bp->b_bufsize, 3847 (vm_offset_t) bp->b_data + newbsize); 3848 if (origbuf != NULL) { 3849 bcopy(origbuf, bp->b_data, origbufsize); 3850 free(origbuf, M_BIOBUF); 3851 } 3852 bufspace_adjust(bp, newbsize); 3853 } 3854 3855 /* 3856 * This code constitutes the buffer memory from either anonymous system 3857 * memory (in the case of non-VMIO operations) or from an associated 3858 * VM object (in the case of VMIO operations). This code is able to 3859 * resize a buffer up or down. 3860 * 3861 * Note that this code is tricky, and has many complications to resolve 3862 * deadlock or inconsistent data situations. Tread lightly!!! 
3863 * There are B_CACHE and B_DELWRI interactions that must be dealt with by 3864 * the caller. Calling this code willy nilly can result in the loss of data. 3865 * 3866 * allocbuf() only adjusts B_CACHE for VMIO buffers. getblk() deals with 3867 * B_CACHE for the non-VMIO case. 3868 */ 3869 int 3870 allocbuf(struct buf *bp, int size) 3871 { 3872 int newbsize; 3873 3874 BUF_ASSERT_HELD(bp); 3875 3876 if (bp->b_bcount == size) 3877 return (1); 3878 3879 if (bp->b_kvasize != 0 && bp->b_kvasize < size) 3880 panic("allocbuf: buffer too small"); 3881 3882 newbsize = roundup2(size, DEV_BSIZE); 3883 if ((bp->b_flags & B_VMIO) == 0) { 3884 if ((bp->b_flags & B_MALLOC) == 0) 3885 newbsize = round_page(newbsize); 3886 /* 3887 * Just get anonymous memory from the kernel. Don't 3888 * mess with B_CACHE. 3889 */ 3890 if (newbsize < bp->b_bufsize) 3891 vfs_nonvmio_truncate(bp, newbsize); 3892 else if (newbsize > bp->b_bufsize) 3893 vfs_nonvmio_extend(bp, newbsize); 3894 } else { 3895 int desiredpages; 3896 3897 desiredpages = (size == 0) ? 0 : 3898 num_pages((bp->b_offset & PAGE_MASK) + newbsize); 3899 3900 if (bp->b_flags & B_MALLOC) 3901 panic("allocbuf: VMIO buffer can't be malloced"); 3902 /* 3903 * Set B_CACHE initially if buffer is 0 length or will become 3904 * 0-length. 3905 */ 3906 if (size == 0 || bp->b_bufsize == 0) 3907 bp->b_flags |= B_CACHE; 3908 3909 if (newbsize < bp->b_bufsize) 3910 vfs_vmio_truncate(bp, desiredpages); 3911 /* XXX This looks as if it should be newbsize > b_bufsize */ 3912 else if (size > bp->b_bcount) 3913 vfs_vmio_extend(bp, desiredpages, size); 3914 bufspace_adjust(bp, newbsize); 3915 } 3916 bp->b_bcount = size; /* requested buffer size. */ 3917 return (1); 3918 } 3919 3920 extern int inflight_transient_maps; 3921 3922 void 3923 biodone(struct bio *bp) 3924 { 3925 struct mtx *mtxp; 3926 void (*done)(struct bio *); 3927 vm_offset_t start, end; 3928 3929 biotrack(bp, __func__); 3930 if ((bp->bio_flags & BIO_TRANSIENT_MAPPING) != 0) { 3931 bp->bio_flags &= ~BIO_TRANSIENT_MAPPING; 3932 bp->bio_flags |= BIO_UNMAPPED; 3933 start = trunc_page((vm_offset_t)bp->bio_data); 3934 end = round_page((vm_offset_t)bp->bio_data + bp->bio_length); 3935 bp->bio_data = unmapped_buf; 3936 pmap_qremove(start, atop(end - start)); 3937 vmem_free(transient_arena, start, end - start); 3938 atomic_add_int(&inflight_transient_maps, -1); 3939 } 3940 done = bp->bio_done; 3941 if (done == NULL) { 3942 mtxp = mtx_pool_find(mtxpool_sleep, bp); 3943 mtx_lock(mtxp); 3944 bp->bio_flags |= BIO_DONE; 3945 wakeup(bp); 3946 mtx_unlock(mtxp); 3947 } else 3948 done(bp); 3949 } 3950 3951 /* 3952 * Wait for a BIO to finish. 
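 *
 * The bio passed in must have a NULL bio_done callback, so that
 * biodone() wakes up the sleeper instead of running a completion
 * routine. An illustrative GEOM-style consumer might look like the
 * following sketch ("cp", "offset", "length" and "ptr" are
 * placeholders, not code from this file):
 *
 *	bip = g_alloc_bio();
 *	bip->bio_cmd = BIO_READ;
 *	bip->bio_offset = offset;
 *	bip->bio_length = length;
 *	bip->bio_data = ptr;
 *	g_io_request(bip, cp);
 *	error = biowait(bip, "exread");
 *	g_destroy_bio(bip);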
3953 */ 3954 int 3955 biowait(struct bio *bp, const char *wchan) 3956 { 3957 struct mtx *mtxp; 3958 3959 mtxp = mtx_pool_find(mtxpool_sleep, bp); 3960 mtx_lock(mtxp); 3961 while ((bp->bio_flags & BIO_DONE) == 0) 3962 msleep(bp, mtxp, PRIBIO, wchan, 0); 3963 mtx_unlock(mtxp); 3964 if (bp->bio_error != 0) 3965 return (bp->bio_error); 3966 if (!(bp->bio_flags & BIO_ERROR)) 3967 return (0); 3968 return (EIO); 3969 } 3970 3971 void 3972 biofinish(struct bio *bp, struct devstat *stat, int error) 3973 { 3974 3975 if (error) { 3976 bp->bio_error = error; 3977 bp->bio_flags |= BIO_ERROR; 3978 } 3979 if (stat != NULL) 3980 devstat_end_transaction_bio(stat, bp); 3981 biodone(bp); 3982 } 3983 3984 #if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING) 3985 void 3986 biotrack_buf(struct bio *bp, const char *location) 3987 { 3988 3989 buf_track(bp->bio_track_bp, location); 3990 } 3991 #endif 3992 3993 /* 3994 * bufwait: 3995 * 3996 * Wait for buffer I/O completion, returning error status. The buffer 3997 * is left locked and B_DONE on return. B_EINTR is converted into an EINTR 3998 * error and cleared. 3999 */ 4000 int 4001 bufwait(struct buf *bp) 4002 { 4003 if (bp->b_iocmd == BIO_READ) 4004 bwait(bp, PRIBIO, "biord"); 4005 else 4006 bwait(bp, PRIBIO, "biowr"); 4007 if (bp->b_flags & B_EINTR) { 4008 bp->b_flags &= ~B_EINTR; 4009 return (EINTR); 4010 } 4011 if (bp->b_ioflags & BIO_ERROR) { 4012 return (bp->b_error ? bp->b_error : EIO); 4013 } else { 4014 return (0); 4015 } 4016 } 4017 4018 /* 4019 * bufdone: 4020 * 4021 * Finish I/O on a buffer, optionally calling a completion function. 4022 * This is usually called from an interrupt so process blocking is 4023 * not allowed. 4024 * 4025 * biodone is also responsible for setting B_CACHE in a B_VMIO bp. 4026 * In a non-VMIO bp, B_CACHE will be set on the next getblk() 4027 * assuming B_INVAL is clear. 4028 * 4029 * For the VMIO case, we set B_CACHE if the op was a read and no 4030 * read error occurred, or if the op was a write. B_CACHE is never 4031 * set if the buffer is invalid or otherwise uncacheable. 4032 * 4033 * biodone does not mess with B_INVAL, allowing the I/O routine or the 4034 * initiator to leave B_INVAL set to brelse the buffer out of existence 4035 * in the biodone routine. 4036 */ 4037 void 4038 bufdone(struct buf *bp) 4039 { 4040 struct bufobj *dropobj; 4041 void (*biodone)(struct buf *); 4042 4043 buf_track(bp, __func__); 4044 CTR3(KTR_BUF, "bufdone(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); 4045 dropobj = NULL; 4046 4047 KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp)); 4048 BUF_ASSERT_HELD(bp); 4049 4050 runningbufwakeup(bp); 4051 if (bp->b_iocmd == BIO_WRITE) 4052 dropobj = bp->b_bufobj; 4053 /* call optional completion function if requested */ 4054 if (bp->b_iodone != NULL) { 4055 biodone = bp->b_iodone; 4056 bp->b_iodone = NULL; 4057 (*biodone) (bp); 4058 if (dropobj) 4059 bufobj_wdrop(dropobj); 4060 return; 4061 } 4062 4063 bufdone_finish(bp); 4064 4065 if (dropobj) 4066 bufobj_wdrop(dropobj); 4067 } 4068 4069 void 4070 bufdone_finish(struct buf *bp) 4071 { 4072 BUF_ASSERT_HELD(bp); 4073 4074 if (!LIST_EMPTY(&bp->b_dep)) 4075 buf_complete(bp); 4076 4077 if (bp->b_flags & B_VMIO) { 4078 /* 4079 * Set B_CACHE if the op was a normal read and no error 4080 * occurred. B_CACHE is set for writes in the b*write() 4081 * routines. 
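 *
 * Relatedly, bufdone() above invokes an optional b_iodone hook and
 * returns before this point is reached; the hook is cleared first, so
 * a callback may finish up by calling bufdone() again.  A purely
 * hypothetical installation (the names are illustrative only):
 *
 *	static void
 *	example_iodone(struct buf *bp)
 *	{
 *		(examine bp->b_ioflags and bp->b_error as needed)
 *		bufdone(bp);
 *	}
 *	...
 *	bp->b_iodone = example_iodone;
 *	bawrite(bp);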
4082 */ 4083 if (bp->b_iocmd == BIO_READ && 4084 !(bp->b_flags & (B_INVAL|B_NOCACHE)) && 4085 !(bp->b_ioflags & BIO_ERROR)) 4086 bp->b_flags |= B_CACHE; 4087 vfs_vmio_iodone(bp); 4088 } 4089 4090 /* 4091 * For asynchronous completions, release the buffer now. The brelse 4092 * will do a wakeup there if necessary - so no need to do a wakeup 4093 * here in the async case. The sync case always needs to do a wakeup. 4094 */ 4095 if (bp->b_flags & B_ASYNC) { 4096 if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_RELBUF)) || 4097 (bp->b_ioflags & BIO_ERROR)) 4098 brelse(bp); 4099 else 4100 bqrelse(bp); 4101 } else 4102 bdone(bp); 4103 } 4104 4105 /* 4106 * This routine is called in lieu of iodone in the case of 4107 * incomplete I/O. This keeps the busy status for pages 4108 * consistent. 4109 */ 4110 void 4111 vfs_unbusy_pages(struct buf *bp) 4112 { 4113 int i; 4114 vm_object_t obj; 4115 vm_page_t m; 4116 4117 runningbufwakeup(bp); 4118 if (!(bp->b_flags & B_VMIO)) 4119 return; 4120 4121 obj = bp->b_bufobj->bo_object; 4122 VM_OBJECT_WLOCK(obj); 4123 for (i = 0; i < bp->b_npages; i++) { 4124 m = bp->b_pages[i]; 4125 if (m == bogus_page) { 4126 m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i); 4127 if (!m) 4128 panic("vfs_unbusy_pages: page missing\n"); 4129 bp->b_pages[i] = m; 4130 if (buf_mapped(bp)) { 4131 BUF_CHECK_MAPPED(bp); 4132 pmap_qenter(trunc_page((vm_offset_t)bp->b_data), 4133 bp->b_pages, bp->b_npages); 4134 } else 4135 BUF_CHECK_UNMAPPED(bp); 4136 } 4137 vm_page_sunbusy(m); 4138 } 4139 vm_object_pip_wakeupn(obj, bp->b_npages); 4140 VM_OBJECT_WUNLOCK(obj); 4141 } 4142 4143 /* 4144 * vfs_page_set_valid: 4145 * 4146 * Set the valid bits in a page based on the supplied offset. The 4147 * range is restricted to the buffer's size. 4148 * 4149 * This routine is typically called after a read completes. 4150 */ 4151 static void 4152 vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m) 4153 { 4154 vm_ooffset_t eoff; 4155 4156 /* 4157 * Compute the end offset, eoff, such that [off, eoff) does not span a 4158 * page boundary and eoff is not greater than the end of the buffer. 4159 * The end of the buffer, in this case, is our file EOF, not the 4160 * allocation size of the buffer. 4161 */ 4162 eoff = (off + PAGE_SIZE) & ~(vm_ooffset_t)PAGE_MASK; 4163 if (eoff > bp->b_offset + bp->b_bcount) 4164 eoff = bp->b_offset + bp->b_bcount; 4165 4166 /* 4167 * Set valid range. This is typically the entire buffer and thus the 4168 * entire page. 4169 */ 4170 if (eoff > off) 4171 vm_page_set_valid_range(m, off & PAGE_MASK, eoff - off); 4172 } 4173 4174 /* 4175 * vfs_page_set_validclean: 4176 * 4177 * Set the valid bits and clear the dirty bits in a page based on the 4178 * supplied offset. The range is restricted to the buffer's size. 4179 */ 4180 static void 4181 vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off, vm_page_t m) 4182 { 4183 vm_ooffset_t soff, eoff; 4184 4185 /* 4186 * Start and end offsets in buffer. eoff - soff may not cross a 4187 * page boundary or cross the end of the buffer. The end of the 4188 * buffer, in this case, is our file EOF, not the allocation size 4189 * of the buffer. 4190 */ 4191 soff = off; 4192 eoff = (off + PAGE_SIZE) & ~(off_t)PAGE_MASK; 4193 if (eoff > bp->b_offset + bp->b_bcount) 4194 eoff = bp->b_offset + bp->b_bcount; 4195 4196 /* 4197 * Set valid range. This is typically the entire buffer and thus the 4198 * entire page. 
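 *
 * A worked example, assuming 4 KB pages: for off == 0x1200 the
 * computation above yields eoff == 0x2000, i.e. the remainder of that
 * page, unless b_offset + b_bcount ends before the page boundary, in
 * which case eoff is clamped to the end of the buffer.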
4199 */ 4200 if (eoff > soff) { 4201 vm_page_set_validclean( 4202 m, 4203 (vm_offset_t) (soff & PAGE_MASK), 4204 (vm_offset_t) (eoff - soff) 4205 ); 4206 } 4207 } 4208 4209 /* 4210 * Ensure that all buffer pages are not exclusive busied. If any page is 4211 * exclusive busy, drain it. 4212 */ 4213 void 4214 vfs_drain_busy_pages(struct buf *bp) 4215 { 4216 vm_page_t m; 4217 int i, last_busied; 4218 4219 VM_OBJECT_ASSERT_WLOCKED(bp->b_bufobj->bo_object); 4220 last_busied = 0; 4221 for (i = 0; i < bp->b_npages; i++) { 4222 m = bp->b_pages[i]; 4223 if (vm_page_xbusied(m)) { 4224 for (; last_busied < i; last_busied++) 4225 vm_page_sbusy(bp->b_pages[last_busied]); 4226 while (vm_page_xbusied(m)) { 4227 vm_page_lock(m); 4228 VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); 4229 vm_page_busy_sleep(m, "vbpage", true); 4230 VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); 4231 } 4232 } 4233 } 4234 for (i = 0; i < last_busied; i++) 4235 vm_page_sunbusy(bp->b_pages[i]); 4236 } 4237 4238 /* 4239 * This routine is called before a device strategy routine. 4240 * It is used to tell the VM system that paging I/O is in 4241 * progress, and treat the pages associated with the buffer 4242 * almost as being exclusive busy. Also the object paging_in_progress 4243 * flag is handled to make sure that the object doesn't become 4244 * inconsistent. 4245 * 4246 * Since I/O has not been initiated yet, certain buffer flags 4247 * such as BIO_ERROR or B_INVAL may be in an inconsistent state 4248 * and should be ignored. 4249 */ 4250 void 4251 vfs_busy_pages(struct buf *bp, int clear_modify) 4252 { 4253 vm_object_t obj; 4254 vm_ooffset_t foff; 4255 vm_page_t m; 4256 int i; 4257 bool bogus; 4258 4259 if (!(bp->b_flags & B_VMIO)) 4260 return; 4261 4262 obj = bp->b_bufobj->bo_object; 4263 foff = bp->b_offset; 4264 KASSERT(bp->b_offset != NOOFFSET, 4265 ("vfs_busy_pages: no buffer offset")); 4266 VM_OBJECT_WLOCK(obj); 4267 vfs_drain_busy_pages(bp); 4268 if (bp->b_bufsize != 0) 4269 vfs_setdirty_locked_object(bp); 4270 bogus = false; 4271 for (i = 0; i < bp->b_npages; i++) { 4272 m = bp->b_pages[i]; 4273 4274 if ((bp->b_flags & B_CLUSTER) == 0) { 4275 vm_object_pip_add(obj, 1); 4276 vm_page_sbusy(m); 4277 } 4278 /* 4279 * When readying a buffer for a read ( i.e 4280 * clear_modify == 0 ), it is important to do 4281 * bogus_page replacement for valid pages in 4282 * partially instantiated buffers. Partially 4283 * instantiated buffers can, in turn, occur when 4284 * reconstituting a buffer from its VM backing store 4285 * base. We only have to do this if B_CACHE is 4286 * clear ( which causes the I/O to occur in the 4287 * first place ). The replacement prevents the read 4288 * I/O from overwriting potentially dirty VM-backed 4289 * pages. XXX bogus page replacement is, uh, bogus. 4290 * It may not work properly with small-block devices. 4291 * We need to find a better way. 4292 */ 4293 if (clear_modify) { 4294 pmap_remove_write(m); 4295 vfs_page_set_validclean(bp, foff, m); 4296 } else if (m->valid == VM_PAGE_BITS_ALL && 4297 (bp->b_flags & B_CACHE) == 0) { 4298 bp->b_pages[i] = bogus_page; 4299 bogus = true; 4300 } 4301 foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; 4302 } 4303 VM_OBJECT_WUNLOCK(obj); 4304 if (bogus && buf_mapped(bp)) { 4305 BUF_CHECK_MAPPED(bp); 4306 pmap_qenter(trunc_page((vm_offset_t)bp->b_data), 4307 bp->b_pages, bp->b_npages); 4308 } 4309 } 4310 4311 /* 4312 * vfs_bio_set_valid: 4313 * 4314 * Set the range within the buffer to valid. The range is 4315 * relative to the beginning of the buffer, b_offset. 
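 * (Illustrative arithmetic, assuming 4 KB pages: a call with base 0x100
 * on a buffer whose b_offset is 0x1200 starts validating at byte 0x300
 * of the first page, so at most 0xd00 bytes of that page are marked
 * valid.)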
Note that 4316 * b_offset itself may be offset from the beginning of the first 4317 * page. 4318 */ 4319 void 4320 vfs_bio_set_valid(struct buf *bp, int base, int size) 4321 { 4322 int i, n; 4323 vm_page_t m; 4324 4325 if (!(bp->b_flags & B_VMIO)) 4326 return; 4327 4328 /* 4329 * Fixup base to be relative to beginning of first page. 4330 * Set initial n to be the maximum number of bytes in the 4331 * first page that can be validated. 4332 */ 4333 base += (bp->b_offset & PAGE_MASK); 4334 n = PAGE_SIZE - (base & PAGE_MASK); 4335 4336 VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); 4337 for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) { 4338 m = bp->b_pages[i]; 4339 if (n > size) 4340 n = size; 4341 vm_page_set_valid_range(m, base & PAGE_MASK, n); 4342 base += n; 4343 size -= n; 4344 n = PAGE_SIZE; 4345 } 4346 VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); 4347 } 4348 4349 /* 4350 * vfs_bio_clrbuf: 4351 * 4352 * If the specified buffer is a non-VMIO buffer, clear the entire 4353 * buffer. If the specified buffer is a VMIO buffer, clear and 4354 * validate only the previously invalid portions of the buffer. 4355 * This routine essentially fakes an I/O, so we need to clear 4356 * BIO_ERROR and B_INVAL. 4357 * 4358 * Note that while we only theoretically need to clear through b_bcount, 4359 * we go ahead and clear through b_bufsize. 4360 */ 4361 void 4362 vfs_bio_clrbuf(struct buf *bp) 4363 { 4364 int i, j, mask, sa, ea, slide; 4365 4366 if ((bp->b_flags & (B_VMIO | B_MALLOC)) != B_VMIO) { 4367 clrbuf(bp); 4368 return; 4369 } 4370 bp->b_flags &= ~B_INVAL; 4371 bp->b_ioflags &= ~BIO_ERROR; 4372 VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); 4373 if ((bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) && 4374 (bp->b_offset & PAGE_MASK) == 0) { 4375 if (bp->b_pages[0] == bogus_page) 4376 goto unlock; 4377 mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1; 4378 VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[0]->object); 4379 if ((bp->b_pages[0]->valid & mask) == mask) 4380 goto unlock; 4381 if ((bp->b_pages[0]->valid & mask) == 0) { 4382 pmap_zero_page_area(bp->b_pages[0], 0, bp->b_bufsize); 4383 bp->b_pages[0]->valid |= mask; 4384 goto unlock; 4385 } 4386 } 4387 sa = bp->b_offset & PAGE_MASK; 4388 slide = 0; 4389 for (i = 0; i < bp->b_npages; i++, sa = 0) { 4390 slide = imin(slide + PAGE_SIZE, bp->b_offset + bp->b_bufsize); 4391 ea = slide & PAGE_MASK; 4392 if (ea == 0) 4393 ea = PAGE_SIZE; 4394 if (bp->b_pages[i] == bogus_page) 4395 continue; 4396 j = sa / DEV_BSIZE; 4397 mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j; 4398 VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[i]->object); 4399 if ((bp->b_pages[i]->valid & mask) == mask) 4400 continue; 4401 if ((bp->b_pages[i]->valid & mask) == 0) 4402 pmap_zero_page_area(bp->b_pages[i], sa, ea - sa); 4403 else { 4404 for (; sa < ea; sa += DEV_BSIZE, j++) { 4405 if ((bp->b_pages[i]->valid & (1 << j)) == 0) { 4406 pmap_zero_page_area(bp->b_pages[i], 4407 sa, DEV_BSIZE); 4408 } 4409 } 4410 } 4411 bp->b_pages[i]->valid |= mask; 4412 } 4413 unlock: 4414 VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); 4415 bp->b_resid = 0; 4416 } 4417 4418 void 4419 vfs_bio_bzero_buf(struct buf *bp, int base, int size) 4420 { 4421 vm_page_t m; 4422 int i, n; 4423 4424 if (buf_mapped(bp)) { 4425 BUF_CHECK_MAPPED(bp); 4426 bzero(bp->b_data + base, size); 4427 } else { 4428 BUF_CHECK_UNMAPPED(bp); 4429 n = PAGE_SIZE - (base & PAGE_MASK); 4430 for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) { 4431 m = bp->b_pages[i]; 4432 if (n > size) 4433 n = size; 4434 pmap_zero_page_area(m, base & 
PAGE_MASK, n); 4435 base += n; 4436 size -= n; 4437 n = PAGE_SIZE; 4438 } 4439 } 4440 } 4441 4442 /* 4443 * Update buffer flags based on I/O request parameters, optionally releasing the 4444 * buffer. If it's VMIO or direct I/O, the buffer pages are released to the VM, 4445 * where they may be placed on a page queue (VMIO) or freed immediately (direct 4446 * I/O). Otherwise the buffer is released to the cache. 4447 */ 4448 static void 4449 b_io_dismiss(struct buf *bp, int ioflag, bool release) 4450 { 4451 4452 KASSERT((ioflag & IO_NOREUSE) == 0 || (ioflag & IO_VMIO) != 0, 4453 ("buf %p non-VMIO noreuse", bp)); 4454 4455 if ((ioflag & IO_DIRECT) != 0) 4456 bp->b_flags |= B_DIRECT; 4457 if ((ioflag & (IO_VMIO | IO_DIRECT)) != 0 && LIST_EMPTY(&bp->b_dep)) { 4458 bp->b_flags |= B_RELBUF; 4459 if ((ioflag & IO_NOREUSE) != 0) 4460 bp->b_flags |= B_NOREUSE; 4461 if (release) 4462 brelse(bp); 4463 } else if (release) 4464 bqrelse(bp); 4465 } 4466 4467 void 4468 vfs_bio_brelse(struct buf *bp, int ioflag) 4469 { 4470 4471 b_io_dismiss(bp, ioflag, true); 4472 } 4473 4474 void 4475 vfs_bio_set_flags(struct buf *bp, int ioflag) 4476 { 4477 4478 b_io_dismiss(bp, ioflag, false); 4479 } 4480 4481 /* 4482 * vm_hold_load_pages and vm_hold_free_pages get pages into 4483 * a buffers address space. The pages are anonymous and are 4484 * not associated with a file object. 4485 */ 4486 static void 4487 vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to) 4488 { 4489 vm_offset_t pg; 4490 vm_page_t p; 4491 int index; 4492 4493 BUF_CHECK_MAPPED(bp); 4494 4495 to = round_page(to); 4496 from = round_page(from); 4497 index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; 4498 4499 for (pg = from; pg < to; pg += PAGE_SIZE, index++) { 4500 tryagain: 4501 /* 4502 * note: must allocate system pages since blocking here 4503 * could interfere with paging I/O, no matter which 4504 * process we are. 4505 */ 4506 p = vm_page_alloc(NULL, 0, VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ | 4507 VM_ALLOC_WIRED | VM_ALLOC_COUNT((to - pg) >> PAGE_SHIFT)); 4508 if (p == NULL) { 4509 VM_WAIT; 4510 goto tryagain; 4511 } 4512 pmap_qenter(pg, &p, 1); 4513 bp->b_pages[index] = p; 4514 } 4515 bp->b_npages = index; 4516 } 4517 4518 /* Return pages associated with this buf to the vm system */ 4519 static void 4520 vm_hold_free_pages(struct buf *bp, int newbsize) 4521 { 4522 vm_offset_t from; 4523 vm_page_t p; 4524 int index, newnpages; 4525 4526 BUF_CHECK_MAPPED(bp); 4527 4528 from = round_page((vm_offset_t)bp->b_data + newbsize); 4529 newnpages = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; 4530 if (bp->b_npages > newnpages) 4531 pmap_qremove(from, bp->b_npages - newnpages); 4532 for (index = newnpages; index < bp->b_npages; index++) { 4533 p = bp->b_pages[index]; 4534 bp->b_pages[index] = NULL; 4535 p->wire_count--; 4536 vm_page_free(p); 4537 } 4538 atomic_subtract_int(&vm_cnt.v_wire_count, bp->b_npages - newnpages); 4539 bp->b_npages = newnpages; 4540 } 4541 4542 /* 4543 * Map an IO request into kernel virtual address space. 4544 * 4545 * All requests are (re)mapped into kernel VA space. 4546 * Notice that we use b_bufsize for the size of the buffer 4547 * to be mapped. b_bcount might be modified by the driver. 4548 * 4549 * Note that even if the caller determines that the address space should 4550 * be valid, a race or a smaller-file mapped into a larger space may 4551 * actually cause vmapbuf() to fail, so all callers of vmapbuf() MUST 4552 * check the return value. 
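 *
 * A hedged sketch of the usual pairing, where bp is assumed to be a
 * pager buffer and udata/len describe an already validated user buffer
 * of at most MAXPHYS bytes:
 *
 *	bp->b_iocmd = BIO_READ;		(the device writes into udata)
 *	bp->b_data = udata;
 *	bp->b_bufsize = bp->b_bcount = len;
 *	if (vmapbuf(bp, 1) < 0)
 *		return (EFAULT);
 *	(issue the transfer and wait for completion)
 *	vunmapbuf(bp);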
4553 * 4554 * This function only works with pager buffers. 4555 */ 4556 int 4557 vmapbuf(struct buf *bp, int mapbuf) 4558 { 4559 vm_prot_t prot; 4560 int pidx; 4561 4562 if (bp->b_bufsize < 0) 4563 return (-1); 4564 prot = VM_PROT_READ; 4565 if (bp->b_iocmd == BIO_READ) 4566 prot |= VM_PROT_WRITE; /* Less backwards than it looks */ 4567 if ((pidx = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map, 4568 (vm_offset_t)bp->b_data, bp->b_bufsize, prot, bp->b_pages, 4569 btoc(MAXPHYS))) < 0) 4570 return (-1); 4571 bp->b_npages = pidx; 4572 bp->b_offset = ((vm_offset_t)bp->b_data) & PAGE_MASK; 4573 if (mapbuf || !unmapped_buf_allowed) { 4574 pmap_qenter((vm_offset_t)bp->b_kvabase, bp->b_pages, pidx); 4575 bp->b_data = bp->b_kvabase + bp->b_offset; 4576 } else 4577 bp->b_data = unmapped_buf; 4578 return(0); 4579 } 4580 4581 /* 4582 * Free the io map PTEs associated with this IO operation. 4583 * We also invalidate the TLB entries and restore the original b_addr. 4584 * 4585 * This function only works with pager buffers. 4586 */ 4587 void 4588 vunmapbuf(struct buf *bp) 4589 { 4590 int npages; 4591 4592 npages = bp->b_npages; 4593 if (buf_mapped(bp)) 4594 pmap_qremove(trunc_page((vm_offset_t)bp->b_data), npages); 4595 vm_page_unhold_pages(bp->b_pages, npages); 4596 4597 bp->b_data = unmapped_buf; 4598 } 4599 4600 void 4601 bdone(struct buf *bp) 4602 { 4603 struct mtx *mtxp; 4604 4605 mtxp = mtx_pool_find(mtxpool_sleep, bp); 4606 mtx_lock(mtxp); 4607 bp->b_flags |= B_DONE; 4608 wakeup(bp); 4609 mtx_unlock(mtxp); 4610 } 4611 4612 void 4613 bwait(struct buf *bp, u_char pri, const char *wchan) 4614 { 4615 struct mtx *mtxp; 4616 4617 mtxp = mtx_pool_find(mtxpool_sleep, bp); 4618 mtx_lock(mtxp); 4619 while ((bp->b_flags & B_DONE) == 0) 4620 msleep(bp, mtxp, pri, wchan, 0); 4621 mtx_unlock(mtxp); 4622 } 4623 4624 int 4625 bufsync(struct bufobj *bo, int waitfor) 4626 { 4627 4628 return (VOP_FSYNC(bo2vnode(bo), waitfor, curthread)); 4629 } 4630 4631 void 4632 bufstrategy(struct bufobj *bo, struct buf *bp) 4633 { 4634 int i = 0; 4635 struct vnode *vp; 4636 4637 vp = bp->b_vp; 4638 KASSERT(vp == bo->bo_private, ("Inconsistent vnode bufstrategy")); 4639 KASSERT(vp->v_type != VCHR && vp->v_type != VBLK, 4640 ("Wrong vnode in bufstrategy(bp=%p, vp=%p)", bp, vp)); 4641 i = VOP_STRATEGY(vp, bp); 4642 KASSERT(i == 0, ("VOP_STRATEGY failed bp=%p vp=%p", bp, bp->b_vp)); 4643 } 4644 4645 void 4646 bufobj_wrefl(struct bufobj *bo) 4647 { 4648 4649 KASSERT(bo != NULL, ("NULL bo in bufobj_wref")); 4650 ASSERT_BO_WLOCKED(bo); 4651 bo->bo_numoutput++; 4652 } 4653 4654 void 4655 bufobj_wref(struct bufobj *bo) 4656 { 4657 4658 KASSERT(bo != NULL, ("NULL bo in bufobj_wref")); 4659 BO_LOCK(bo); 4660 bo->bo_numoutput++; 4661 BO_UNLOCK(bo); 4662 } 4663 4664 void 4665 bufobj_wdrop(struct bufobj *bo) 4666 { 4667 4668 KASSERT(bo != NULL, ("NULL bo in bufobj_wdrop")); 4669 BO_LOCK(bo); 4670 KASSERT(bo->bo_numoutput > 0, ("bufobj_wdrop non-positive count")); 4671 if ((--bo->bo_numoutput == 0) && (bo->bo_flag & BO_WWAIT)) { 4672 bo->bo_flag &= ~BO_WWAIT; 4673 wakeup(&bo->bo_numoutput); 4674 } 4675 BO_UNLOCK(bo); 4676 } 4677 4678 int 4679 bufobj_wwait(struct bufobj *bo, int slpflag, int timeo) 4680 { 4681 int error; 4682 4683 KASSERT(bo != NULL, ("NULL bo in bufobj_wwait")); 4684 ASSERT_BO_WLOCKED(bo); 4685 error = 0; 4686 while (bo->bo_numoutput) { 4687 bo->bo_flag |= BO_WWAIT; 4688 error = msleep(&bo->bo_numoutput, BO_LOCKPTR(bo), 4689 slpflag | (PRIBIO + 1), "bo_wwait", timeo); 4690 if (error) 4691 break; 4692 } 4693 return 
(error); 4694 } 4695 4696 /* 4697 * Set bio_data or bio_ma for struct bio from the struct buf. 4698 */ 4699 void 4700 bdata2bio(struct buf *bp, struct bio *bip) 4701 { 4702 4703 if (!buf_mapped(bp)) { 4704 KASSERT(unmapped_buf_allowed, ("unmapped")); 4705 bip->bio_ma = bp->b_pages; 4706 bip->bio_ma_n = bp->b_npages; 4707 bip->bio_data = unmapped_buf; 4708 bip->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK; 4709 bip->bio_flags |= BIO_UNMAPPED; 4710 KASSERT(round_page(bip->bio_ma_offset + bip->bio_length) / 4711 PAGE_SIZE == bp->b_npages, 4712 ("Buffer %p too short: %d %lld %d", bp, bip->bio_ma_offset, 4713 (long long)bip->bio_length, bip->bio_ma_n)); 4714 } else { 4715 bip->bio_data = bp->b_data; 4716 bip->bio_ma = NULL; 4717 } 4718 } 4719 4720 /* 4721 * The MIPS pmap code currently doesn't handle aliased pages. 4722 * The VIPT caches may not handle page aliasing themselves, leading 4723 * to data corruption. 4724 * 4725 * As such, this code makes a system extremely unhappy if said 4726 * system doesn't support unaliasing the above situation in hardware. 4727 * Some "recent" systems (eg some mips24k/mips74k cores) don't enable 4728 * this feature at build time, so it has to be handled in software. 4729 * 4730 * Once the MIPS pmap/cache code grows to support this function on 4731 * earlier chips, it should be flipped back off. 4732 */ 4733 #ifdef __mips__ 4734 static int buf_pager_relbuf = 1; 4735 #else 4736 static int buf_pager_relbuf = 0; 4737 #endif 4738 SYSCTL_INT(_vfs, OID_AUTO, buf_pager_relbuf, CTLFLAG_RWTUN, 4739 &buf_pager_relbuf, 0, 4740 "Make buffer pager release buffers after reading"); 4741 4742 /* 4743 * The buffer pager. It uses buffer reads to validate pages. 4744 * 4745 * In contrast to the generic local pager from vm/vnode_pager.c, this 4746 * pager correctly and easily handles volumes where the underlying 4747 * device block size is greater than the machine page size. The 4748 * buffer cache transparently extends the requested page run to be 4749 * aligned at the block boundary, and does the necessary bogus page 4750 * replacements in the addends to avoid obliterating already valid 4751 * pages. 4752 * 4753 * The only non-trivial issue is that the exclusive busy state for 4754 * pages, which is assumed by the vm_pager_getpages() interface, is 4755 * incompatible with the VMIO buffer cache's desire to share-busy the 4756 * pages. This function performs a trivial downgrade of the pages' 4757 * state before reading buffers, and a less trivial upgrade from the 4758 * shared-busy to excl-busy state after the read. 4759 */ 4760 int 4761 vfs_bio_getpages(struct vnode *vp, vm_page_t *ma, int count, 4762 int *rbehind, int *rahead, vbg_get_lblkno_t get_lblkno, 4763 vbg_get_blksize_t get_blksize) 4764 { 4765 vm_page_t m; 4766 vm_object_t object; 4767 struct buf *bp; 4768 struct mount *mp; 4769 daddr_t lbn, lbnp; 4770 vm_ooffset_t la, lb, poff, poffe; 4771 long bsize; 4772 int bo_bs, br_flags, error, i, pgsin, pgsin_a, pgsin_b; 4773 bool redo, lpart; 4774 4775 object = vp->v_object; 4776 mp = vp->v_mount; 4777 la = IDX_TO_OFF(ma[count - 1]->pindex); 4778 if (la >= object->un_pager.vnp.vnp_size) 4779 return (VM_PAGER_BAD); 4780 lpart = la + PAGE_SIZE > object->un_pager.vnp.vnp_size; 4781 bo_bs = get_blksize(vp, get_lblkno(vp, IDX_TO_OFF(ma[0]->pindex))); 4782 4783 /* 4784 * Calculate read-ahead, behind and total pages. 
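 *
 * An illustrative case, assuming 4 KB pages and a 32 KB block size: if
 * the first requested page lies at offset 0x5000, it sits 0x5000 bytes
 * into its block, so pgsin_b below becomes 5 and the read-behind span
 * covers the five pages from the start of the block; the read-ahead
 * span is likewise extended toward the end of the block, clamped to
 * the size of the backing vnode.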
4785 */ 4786 pgsin = count; 4787 lb = IDX_TO_OFF(ma[0]->pindex); 4788 pgsin_b = OFF_TO_IDX(lb - rounddown2(lb, bo_bs)); 4789 pgsin += pgsin_b; 4790 if (rbehind != NULL) 4791 *rbehind = pgsin_b; 4792 pgsin_a = OFF_TO_IDX(roundup2(la, bo_bs) - la); 4793 if (la + IDX_TO_OFF(pgsin_a) >= object->un_pager.vnp.vnp_size) 4794 pgsin_a = OFF_TO_IDX(roundup2(object->un_pager.vnp.vnp_size, 4795 PAGE_SIZE) - la); 4796 pgsin += pgsin_a; 4797 if (rahead != NULL) 4798 *rahead = pgsin_a; 4799 VM_CNT_INC(v_vnodein); 4800 VM_CNT_ADD(v_vnodepgsin, pgsin); 4801 4802 br_flags = (mp != NULL && (mp->mnt_kern_flag & MNTK_UNMAPPED_BUFS) 4803 != 0) ? GB_UNMAPPED : 0; 4804 VM_OBJECT_WLOCK(object); 4805 again: 4806 for (i = 0; i < count; i++) 4807 vm_page_busy_downgrade(ma[i]); 4808 VM_OBJECT_WUNLOCK(object); 4809 4810 lbnp = -1; 4811 for (i = 0; i < count; i++) { 4812 m = ma[i]; 4813 4814 /* 4815 * Pages are shared busy and the object lock is not 4816 * owned, which together allow for the pages' 4817 * invalidation. The racy test for validity avoids 4818 * useless creation of the buffer for the most typical 4819 * case when invalidation is not used in redo or for 4820 * parallel read. The shared->excl upgrade loop at 4821 * the end of the function catches the race in a 4822 * reliable way (protected by the object lock). 4823 */ 4824 if (m->valid == VM_PAGE_BITS_ALL) 4825 continue; 4826 4827 poff = IDX_TO_OFF(m->pindex); 4828 poffe = MIN(poff + PAGE_SIZE, object->un_pager.vnp.vnp_size); 4829 for (; poff < poffe; poff += bsize) { 4830 lbn = get_lblkno(vp, poff); 4831 if (lbn == lbnp) 4832 goto next_page; 4833 lbnp = lbn; 4834 4835 bsize = get_blksize(vp, lbn); 4836 error = bread_gb(vp, lbn, bsize, curthread->td_ucred, 4837 br_flags, &bp); 4838 if (error != 0) 4839 goto end_pages; 4840 if (LIST_EMPTY(&bp->b_dep)) { 4841 /* 4842 * Invalidation clears m->valid, but 4843 * may leave B_CACHE flag if the 4844 * buffer existed at the invalidation 4845 * time. In this case, recycle the 4846 * buffer to do real read on next 4847 * bread() after redo. 4848 * 4849 * Otherwise B_RELBUF is not strictly 4850 * necessary, enable to reduce buf 4851 * cache pressure. 4852 */ 4853 if (buf_pager_relbuf || 4854 m->valid != VM_PAGE_BITS_ALL) 4855 bp->b_flags |= B_RELBUF; 4856 4857 bp->b_flags &= ~B_NOCACHE; 4858 brelse(bp); 4859 } else { 4860 bqrelse(bp); 4861 } 4862 } 4863 KASSERT(1 /* racy, enable for debugging */ || 4864 m->valid == VM_PAGE_BITS_ALL || i == count - 1, 4865 ("buf %d %p invalid", i, m)); 4866 if (i == count - 1 && lpart) { 4867 VM_OBJECT_WLOCK(object); 4868 if (m->valid != 0 && 4869 m->valid != VM_PAGE_BITS_ALL) 4870 vm_page_zero_invalid(m, TRUE); 4871 VM_OBJECT_WUNLOCK(object); 4872 } 4873 next_page:; 4874 } 4875 end_pages: 4876 4877 VM_OBJECT_WLOCK(object); 4878 redo = false; 4879 for (i = 0; i < count; i++) { 4880 vm_page_sunbusy(ma[i]); 4881 ma[i] = vm_page_grab(object, ma[i]->pindex, VM_ALLOC_NORMAL); 4882 4883 /* 4884 * Since the pages were only sbusy while neither the 4885 * buffer nor the object lock was held by us, or 4886 * reallocated while vm_page_grab() slept for busy 4887 * relinguish, they could have been invalidated. 4888 * Recheck the valid bits and re-read as needed. 4889 * 4890 * Note that the last page is made fully valid in the 4891 * read loop, and partial validity for the page at 4892 * index count - 1 could mean that the page was 4893 * invalidated or removed, so we must restart for 4894 * safety as well. 
4895 */ 4896 if (ma[i]->valid != VM_PAGE_BITS_ALL) 4897 redo = true; 4898 } 4899 if (redo && error == 0) 4900 goto again; 4901 VM_OBJECT_WUNLOCK(object); 4902 return (error != 0 ? VM_PAGER_ERROR : VM_PAGER_OK); 4903 } 4904 4905 #include "opt_ddb.h" 4906 #ifdef DDB 4907 #include <ddb/ddb.h> 4908 4909 /* DDB command to show buffer data */ 4910 DB_SHOW_COMMAND(buffer, db_show_buffer) 4911 { 4912 /* get args */ 4913 struct buf *bp = (struct buf *)addr; 4914 #ifdef FULL_BUF_TRACKING 4915 uint32_t i, j; 4916 #endif 4917 4918 if (!have_addr) { 4919 db_printf("usage: show buffer <addr>\n"); 4920 return; 4921 } 4922 4923 db_printf("buf at %p\n", bp); 4924 db_printf("b_flags = 0x%b, b_xflags=0x%b, b_vflags=0x%b\n", 4925 (u_int)bp->b_flags, PRINT_BUF_FLAGS, (u_int)bp->b_xflags, 4926 PRINT_BUF_XFLAGS, (u_int)bp->b_vflags, PRINT_BUF_VFLAGS); 4927 db_printf( 4928 "b_error = %d, b_bufsize = %ld, b_bcount = %ld, b_resid = %ld\n" 4929 "b_bufobj = (%p), b_data = %p, b_blkno = %jd, b_lblkno = %jd, " 4930 "b_dep = %p\n", 4931 bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid, 4932 bp->b_bufobj, bp->b_data, (intmax_t)bp->b_blkno, 4933 (intmax_t)bp->b_lblkno, bp->b_dep.lh_first); 4934 db_printf("b_kvabase = %p, b_kvasize = %d\n", 4935 bp->b_kvabase, bp->b_kvasize); 4936 if (bp->b_npages) { 4937 int i; 4938 db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages); 4939 for (i = 0; i < bp->b_npages; i++) { 4940 vm_page_t m; 4941 m = bp->b_pages[i]; 4942 if (m != NULL) 4943 db_printf("(%p, 0x%lx, 0x%lx)", m->object, 4944 (u_long)m->pindex, 4945 (u_long)VM_PAGE_TO_PHYS(m)); 4946 else 4947 db_printf("( ??? )"); 4948 if ((i + 1) < bp->b_npages) 4949 db_printf(","); 4950 } 4951 db_printf("\n"); 4952 } 4953 #if defined(FULL_BUF_TRACKING) 4954 db_printf("b_io_tracking: b_io_tcnt = %u\n", bp->b_io_tcnt); 4955 4956 i = bp->b_io_tcnt % BUF_TRACKING_SIZE; 4957 for (j = 1; j <= BUF_TRACKING_SIZE; j++) { 4958 if (bp->b_io_tracking[BUF_TRACKING_ENTRY(i - j)] == NULL) 4959 continue; 4960 db_printf(" %2u: %s\n", j, 4961 bp->b_io_tracking[BUF_TRACKING_ENTRY(i - j)]); 4962 } 4963 #elif defined(BUF_TRACKING) 4964 db_printf("b_io_tracking: %s\n", bp->b_io_tracking); 4965 #endif 4966 db_printf(" "); 4967 BUF_LOCKPRINTINFO(bp); 4968 } 4969 4970 DB_SHOW_COMMAND(lockedbufs, lockedbufs) 4971 { 4972 struct buf *bp; 4973 int i; 4974 4975 for (i = 0; i < nbuf; i++) { 4976 bp = &buf[i]; 4977 if (BUF_ISLOCKED(bp)) { 4978 db_show_buffer((uintptr_t)bp, 1, 0, NULL); 4979 db_printf("\n"); 4980 if (db_pager_quit) 4981 break; 4982 } 4983 } 4984 } 4985 4986 DB_SHOW_COMMAND(vnodebufs, db_show_vnodebufs) 4987 { 4988 struct vnode *vp; 4989 struct buf *bp; 4990 4991 if (!have_addr) { 4992 db_printf("usage: show vnodebufs <addr>\n"); 4993 return; 4994 } 4995 vp = (struct vnode *)addr; 4996 db_printf("Clean buffers:\n"); 4997 TAILQ_FOREACH(bp, &vp->v_bufobj.bo_clean.bv_hd, b_bobufs) { 4998 db_show_buffer((uintptr_t)bp, 1, 0, NULL); 4999 db_printf("\n"); 5000 } 5001 db_printf("Dirty buffers:\n"); 5002 TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs) { 5003 db_show_buffer((uintptr_t)bp, 1, 0, NULL); 5004 db_printf("\n"); 5005 } 5006 } 5007 5008 DB_COMMAND(countfreebufs, db_coundfreebufs) 5009 { 5010 struct buf *bp; 5011 int i, used = 0, nfree = 0; 5012 5013 if (have_addr) { 5014 db_printf("usage: countfreebufs\n"); 5015 return; 5016 } 5017 5018 for (i = 0; i < nbuf; i++) { 5019 bp = &buf[i]; 5020 if (bp->b_qindex == QUEUE_EMPTY) 5021 nfree++; 5022 else 5023 used++; 5024 } 5025 5026 db_printf("Counted %d free, %d used (%d tot)\n", nfree, 
used, 5027 nfree + used); 5028 db_printf("numfreebuffers is %d\n", numfreebuffers); 5029 } 5030 #endif /* DDB */ 5031
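
/*
 * Usage sketch (illustrative only; the examplefs_* names are
 * hypothetical): a filesystem can implement its VOP_GETPAGES on top of
 * vfs_bio_getpages() above by supplying the two translation callbacks.
 *
 *	static daddr_t
 *	examplefs_gbp_getblkno(struct vnode *vp, vm_ooffset_t off)
 *	{
 *		(translate the byte offset into a logical block number)
 *	}
 *
 *	static int
 *	examplefs_gbp_getblksize(struct vnode *vp, daddr_t lbn)
 *	{
 *		(return the size of logical block lbn)
 *	}
 *
 *	static int
 *	examplefs_getpages(struct vop_getpages_args *ap)
 *	{
 *
 *		return (vfs_bio_getpages(ap->a_vp, ap->a_m, ap->a_count,
 *		    ap->a_rbehind, ap->a_rahead, examplefs_gbp_getblkno,
 *		    examplefs_gbp_getblksize));
 *	}
 */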