1 /*- 2 * Copyright (c) 2004 Poul-Henning Kamp 3 * Copyright (c) 1994,1997 John S. Dyson 4 * Copyright (c) 2013 The FreeBSD Foundation 5 * All rights reserved. 6 * 7 * Portions of this software were developed by Konstantin Belousov 8 * under sponsorship from the FreeBSD Foundation. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 * SUCH DAMAGE. 30 */ 31 32 /* 33 * this file contains a new buffer I/O scheme implementing a coherent 34 * VM object and buffer cache scheme. Pains have been taken to make 35 * sure that the performance degradation associated with schemes such 36 * as this is not realized. 37 * 38 * Author: John S. Dyson 39 * Significant help during the development and debugging phases 40 * had been provided by David Greenman, also of the FreeBSD core team. 41 * 42 * see man buf(9) for more info. 43 */ 44 45 #include <sys/cdefs.h> 46 __FBSDID("$FreeBSD$"); 47 48 #include <sys/param.h> 49 #include <sys/systm.h> 50 #include <sys/bio.h> 51 #include <sys/conf.h> 52 #include <sys/buf.h> 53 #include <sys/devicestat.h> 54 #include <sys/eventhandler.h> 55 #include <sys/fail.h> 56 #include <sys/limits.h> 57 #include <sys/lock.h> 58 #include <sys/malloc.h> 59 #include <sys/mount.h> 60 #include <sys/mutex.h> 61 #include <sys/kernel.h> 62 #include <sys/kthread.h> 63 #include <sys/proc.h> 64 #include <sys/resourcevar.h> 65 #include <sys/rwlock.h> 66 #include <sys/sysctl.h> 67 #include <sys/vmem.h> 68 #include <sys/vmmeter.h> 69 #include <sys/vnode.h> 70 #include <geom/geom.h> 71 #include <vm/vm.h> 72 #include <vm/vm_param.h> 73 #include <vm/vm_kern.h> 74 #include <vm/vm_pageout.h> 75 #include <vm/vm_page.h> 76 #include <vm/vm_object.h> 77 #include <vm/vm_extern.h> 78 #include <vm/vm_map.h> 79 #include "opt_compat.h" 80 #include "opt_directio.h" 81 #include "opt_swap.h" 82 83 static MALLOC_DEFINE(M_BIOBUF, "biobuf", "BIO buffer"); 84 85 struct bio_ops bioops; /* I/O operation notification */ 86 87 struct buf_ops buf_ops_bio = { 88 .bop_name = "buf_ops_bio", 89 .bop_write = bufwrite, 90 .bop_strategy = bufstrategy, 91 .bop_sync = bufsync, 92 .bop_bdflush = bufbdflush, 93 }; 94 95 /* 96 * XXX buf is global because kern_shutdown.c and ffs_checkoverlap has 97 * carnal knowledge of buffers. This knowledge should be moved to vfs_bio.c. 
98 */ 99 struct buf *buf; /* buffer header pool */ 100 caddr_t unmapped_buf; 101 102 static struct proc *bufdaemonproc; 103 104 static int inmem(struct vnode *vp, daddr_t blkno); 105 static void vm_hold_free_pages(struct buf *bp, int newbsize); 106 static void vm_hold_load_pages(struct buf *bp, vm_offset_t from, 107 vm_offset_t to); 108 static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m); 109 static void vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off, 110 vm_page_t m); 111 static void vfs_drain_busy_pages(struct buf *bp); 112 static void vfs_clean_pages_dirty_buf(struct buf *bp); 113 static void vfs_setdirty_locked_object(struct buf *bp); 114 static void vfs_vmio_release(struct buf *bp); 115 static int vfs_bio_clcheck(struct vnode *vp, int size, 116 daddr_t lblkno, daddr_t blkno); 117 static int buf_flush(struct vnode *vp, int); 118 static int flushbufqueues(struct vnode *, int, int); 119 static void buf_daemon(void); 120 static void bremfreel(struct buf *bp); 121 static __inline void bd_wakeup(void); 122 #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ 123 defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) 124 static int sysctl_bufspace(SYSCTL_HANDLER_ARGS); 125 #endif 126 127 int vmiodirenable = TRUE; 128 SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW, &vmiodirenable, 0, 129 "Use the VM system for directory writes"); 130 long runningbufspace; 131 SYSCTL_LONG(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0, 132 "Amount of presently outstanding async buffer io"); 133 static long bufspace; 134 #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ 135 defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) 136 SYSCTL_PROC(_vfs, OID_AUTO, bufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RD, 137 &bufspace, 0, sysctl_bufspace, "L", "Virtual memory used for buffers"); 138 #else 139 SYSCTL_LONG(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0, 140 "Virtual memory used for buffers"); 141 #endif 142 static long unmapped_bufspace; 143 SYSCTL_LONG(_vfs, OID_AUTO, unmapped_bufspace, CTLFLAG_RD, 144 &unmapped_bufspace, 0, 145 "Amount of unmapped buffers, inclusive in the bufspace"); 146 static long maxbufspace; 147 SYSCTL_LONG(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RD, &maxbufspace, 0, 148 "Maximum allowed value of bufspace (including buf_daemon)"); 149 static long bufmallocspace; 150 SYSCTL_LONG(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0, 151 "Amount of malloced memory for buffers"); 152 static long maxbufmallocspace; 153 SYSCTL_LONG(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace, 0, 154 "Maximum amount of malloced memory for buffers"); 155 static long lobufspace; 156 SYSCTL_LONG(_vfs, OID_AUTO, lobufspace, CTLFLAG_RD, &lobufspace, 0, 157 "Minimum amount of buffers we want to have"); 158 long hibufspace; 159 SYSCTL_LONG(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD, &hibufspace, 0, 160 "Maximum allowed value of bufspace (excluding buf_daemon)"); 161 static int bufreusecnt; 162 SYSCTL_INT(_vfs, OID_AUTO, bufreusecnt, CTLFLAG_RW, &bufreusecnt, 0, 163 "Number of times we have reused a buffer"); 164 static int buffreekvacnt; 165 SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, 0, 166 "Number of times we have freed the KVA space from some buffer"); 167 static int bufdefragcnt; 168 SYSCTL_INT(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt, 0, 169 "Number of times we have had to repeat buffer allocation to defragment"); 170 static long lorunningspace; 171 SYSCTL_LONG(_vfs, 
OID_AUTO, lorunningspace, CTLFLAG_RW, &lorunningspace, 0, 172 "Minimum preferred space used for in-progress I/O"); 173 static long hirunningspace; 174 SYSCTL_LONG(_vfs, OID_AUTO, hirunningspace, CTLFLAG_RW, &hirunningspace, 0, 175 "Maximum amount of space to use for in-progress I/O"); 176 int dirtybufferflushes; 177 SYSCTL_INT(_vfs, OID_AUTO, dirtybufferflushes, CTLFLAG_RW, &dirtybufferflushes, 178 0, "Number of bdwrite to bawrite conversions to limit dirty buffers"); 179 int bdwriteskip; 180 SYSCTL_INT(_vfs, OID_AUTO, bdwriteskip, CTLFLAG_RW, &bdwriteskip, 181 0, "Number of buffers supplied to bdwrite with snapshot deadlock risk"); 182 int altbufferflushes; 183 SYSCTL_INT(_vfs, OID_AUTO, altbufferflushes, CTLFLAG_RW, &altbufferflushes, 184 0, "Number of fsync flushes to limit dirty buffers"); 185 static int recursiveflushes; 186 SYSCTL_INT(_vfs, OID_AUTO, recursiveflushes, CTLFLAG_RW, &recursiveflushes, 187 0, "Number of flushes skipped due to being recursive"); 188 static int numdirtybuffers; 189 SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD, &numdirtybuffers, 0, 190 "Number of buffers that are dirty (has unwritten changes) at the moment"); 191 static int lodirtybuffers; 192 SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW, &lodirtybuffers, 0, 193 "How many buffers we want to have free before bufdaemon can sleep"); 194 static int hidirtybuffers; 195 SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW, &hidirtybuffers, 0, 196 "When the number of dirty buffers is considered severe"); 197 int dirtybufthresh; 198 SYSCTL_INT(_vfs, OID_AUTO, dirtybufthresh, CTLFLAG_RW, &dirtybufthresh, 199 0, "Number of bdwrite to bawrite conversions to clear dirty buffers"); 200 static int numfreebuffers; 201 SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, &numfreebuffers, 0, 202 "Number of free buffers"); 203 static int lofreebuffers; 204 SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW, &lofreebuffers, 0, 205 "XXX Unused"); 206 static int hifreebuffers; 207 SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, &hifreebuffers, 0, 208 "XXX Complicatedly unused"); 209 static int getnewbufcalls; 210 SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, &getnewbufcalls, 0, 211 "Number of calls to getnewbuf"); 212 static int getnewbufrestarts; 213 SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW, &getnewbufrestarts, 0, 214 "Number of times getnewbuf has had to restart a buffer acquisition"); 215 static int mappingrestarts; 216 SYSCTL_INT(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RW, &mappingrestarts, 0, 217 "Number of times getblk has had to restart a buffer mapping for " 218 "unmapped buffer"); 219 static int flushbufqtarget = 100; 220 SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0, 221 "Amount of work to do in flushbufqueues when helping bufdaemon"); 222 static long notbufdflushes; 223 SYSCTL_LONG(_vfs, OID_AUTO, notbufdflushes, CTLFLAG_RD, &notbufdflushes, 0, 224 "Number of dirty buffer flushes done by the bufdaemon helpers"); 225 static long barrierwrites; 226 SYSCTL_LONG(_vfs, OID_AUTO, barrierwrites, CTLFLAG_RW, &barrierwrites, 0, 227 "Number of barrier writes"); 228 SYSCTL_INT(_vfs, OID_AUTO, unmapped_buf_allowed, CTLFLAG_RD, 229 &unmapped_buf_allowed, 0, 230 "Permit the use of the unmapped i/o"); 231 232 /* 233 * Lock for the non-dirty bufqueues 234 */ 235 static struct mtx_padalign bqclean; 236 237 /* 238 * Lock for the dirty queue. 239 */ 240 static struct mtx_padalign bqdirty; 241 242 /* 243 * This lock synchronizes access to bd_request.
244 */ 245 static struct mtx_padalign bdlock; 246 247 /* 248 * This lock protects the runningbufreq and synchronizes runningbufwakeup and 249 * waitrunningbufspace(). 250 */ 251 static struct mtx_padalign rbreqlock; 252 253 /* 254 * Lock that protects needsbuffer and the sleeps/wakeups surrounding it. 255 */ 256 static struct mtx_padalign nblock; 257 258 /* 259 * Lock that protects bdirtywait. 260 */ 261 static struct mtx_padalign bdirtylock; 262 263 /* 264 * Wakeup point for bufdaemon, as well as indicator of whether it is already 265 * active. Set to 1 when the bufdaemon is already "on" the queue, 0 when it 266 * is idling. 267 */ 268 static int bd_request; 269 270 /* 271 * Request for the buf daemon to write more buffers than is indicated by 272 * lodirtybuf. This may be necessary to push out excess dependencies or 273 * defragment the address space where a simple count of the number of dirty 274 * buffers is insufficient to characterize the demand for flushing them. 275 */ 276 static int bd_speedupreq; 277 278 /* 279 * bogus page -- for I/O to/from partially complete buffers 280 * this is a temporary solution to the problem, but it is not 281 * really that bad. it would be better to split the buffer 282 * for input in the case of buffers partially already in memory, 283 * but the code is intricate enough already. 284 */ 285 vm_page_t bogus_page; 286 287 /* 288 * Synchronization (sleep/wakeup) variable for active buffer space requests. 289 * Set when wait starts, cleared prior to wakeup(). 290 * Used in runningbufwakeup() and waitrunningbufspace(). 291 */ 292 static int runningbufreq; 293 294 /* 295 * Synchronization (sleep/wakeup) variable for buffer requests. 296 * Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done 297 * by and/or. 298 * Used in numdirtywakeup(), bufspacewakeup(), bufcountadd(), bwillwrite(), 299 * getnewbuf(), and getblk(). 300 */ 301 static int needsbuffer; 302 303 /* 304 * Synchronization for bwillwrite() waiters. 305 */ 306 static int bdirtywait; 307 308 /* 309 * Definitions for the buffer free lists. 310 */ 311 #define BUFFER_QUEUES 5 /* number of free buffer queues */ 312 313 #define QUEUE_NONE 0 /* on no queue */ 314 #define QUEUE_CLEAN 1 /* non-B_DELWRI buffers */ 315 #define QUEUE_DIRTY 2 /* B_DELWRI buffers */ 316 #define QUEUE_EMPTYKVA 3 /* empty buffer headers w/KVA assignment */ 317 #define QUEUE_EMPTY 4 /* empty buffer headers */ 318 #define QUEUE_SENTINEL 1024 /* not an queue index, but mark for sentinel */ 319 320 /* Queues for free buffers with various properties */ 321 static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } }; 322 #ifdef INVARIANTS 323 static int bq_len[BUFFER_QUEUES]; 324 #endif 325 326 /* 327 * Single global constant for BUF_WMESG, to avoid getting multiple references. 328 * buf_wmesg is referred from macros. 
329 */ 330 const char *buf_wmesg = BUF_WMESG; 331 332 #define VFS_BIO_NEED_ANY 0x01 /* any freeable buffer */ 333 #define VFS_BIO_NEED_FREE 0x04 /* wait for free bufs, hi hysteresis */ 334 #define VFS_BIO_NEED_BUFSPACE 0x08 /* wait for buf space, lo hysteresis */ 335 336 #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ 337 defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) 338 static int 339 sysctl_bufspace(SYSCTL_HANDLER_ARGS) 340 { 341 long lvalue; 342 int ivalue; 343 344 if (sizeof(int) == sizeof(long) || req->oldlen >= sizeof(long)) 345 return (sysctl_handle_long(oidp, arg1, arg2, req)); 346 lvalue = *(long *)arg1; 347 if (lvalue > INT_MAX) 348 /* On overflow, still write out a long to trigger ENOMEM. */ 349 return (sysctl_handle_long(oidp, &lvalue, 0, req)); 350 ivalue = lvalue; 351 return (sysctl_handle_int(oidp, &ivalue, 0, req)); 352 } 353 #endif 354 355 #ifdef DIRECTIO 356 extern void ffs_rawread_setup(void); 357 #endif /* DIRECTIO */ 358 359 /* 360 * bqlock: 361 * 362 * Return the appropriate queue lock based on the index. 363 */ 364 static inline struct mtx * 365 bqlock(int qindex) 366 { 367 368 if (qindex == QUEUE_DIRTY) 369 return (struct mtx *)(&bqdirty); 370 return (struct mtx *)(&bqclean); 371 } 372 373 /* 374 * bdirtywakeup: 375 * 376 * Wakeup any bwillwrite() waiters. 377 */ 378 static void 379 bdirtywakeup(void) 380 { 381 mtx_lock(&bdirtylock); 382 if (bdirtywait) { 383 bdirtywait = 0; 384 wakeup(&bdirtywait); 385 } 386 mtx_unlock(&bdirtylock); 387 } 388 389 /* 390 * bdirtysub: 391 * 392 * Decrement the numdirtybuffers count by one and wakeup any 393 * threads blocked in bwillwrite(). 394 */ 395 static void 396 bdirtysub(void) 397 { 398 399 if (atomic_fetchadd_int(&numdirtybuffers, -1) == 400 (lodirtybuffers + hidirtybuffers) / 2) 401 bdirtywakeup(); 402 } 403 404 /* 405 * bdirtyadd: 406 * 407 * Increment the numdirtybuffers count by one and wakeup the buf 408 * daemon if needed. 409 */ 410 static void 411 bdirtyadd(void) 412 { 413 414 /* 415 * Only do the wakeup once as we cross the boundary. The 416 * buf daemon will keep running until the condition clears. 417 */ 418 if (atomic_fetchadd_int(&numdirtybuffers, 1) == 419 (lodirtybuffers + hidirtybuffers) / 2) 420 bd_wakeup(); 421 } 422 423 /* 424 * bufspacewakeup: 425 * 426 * Called when buffer space is potentially available for recovery. 427 * getnewbuf() will block on this flag when it is unable to free 428 * sufficient buffer space. Buffer space becomes recoverable when 429 * bp's get placed back in the queues. 430 */ 431 432 static __inline void 433 bufspacewakeup(void) 434 { 435 436 /* 437 * If someone is waiting for BUF space, wake them up. Even 438 * though we haven't freed the kva space yet, the waiting 439 * process will be able to now. 440 */ 441 mtx_lock(&nblock); 442 if (needsbuffer & VFS_BIO_NEED_BUFSPACE) { 443 needsbuffer &= ~VFS_BIO_NEED_BUFSPACE; 444 wakeup(&needsbuffer); 445 } 446 mtx_unlock(&nblock); 447 } 448 449 /* 450 * runningwakeup: 451 * 452 * Wake up processes that are waiting on asynchronous writes to fall 453 * below lorunningspace. 454 */ 455 static void 456 runningwakeup(void) 457 { 458 459 mtx_lock(&rbreqlock); 460 if (runningbufreq) { 461 runningbufreq = 0; 462 wakeup(&runningbufreq); 463 } 464 mtx_unlock(&rbreqlock); 465 } 466 467 /* 468 * runningbufwakeup: 469 * 470 * Decrement the outstanding write count according. 
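 *
 * As a rough worked example (the value is assumed, not a tuned default):
 * with lorunningspace at 1 MB, a completion that drops runningbufspace
 * from 1.2 MB to 0.9 MB crosses the threshold and calls runningwakeup(),
 * while one that drops it from 0.8 MB to 0.5 MB returns early because
 * the old value was already below the threshold.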
471 */ 472 void 473 runningbufwakeup(struct buf *bp) 474 { 475 long space, bspace; 476 477 bspace = bp->b_runningbufspace; 478 if (bspace == 0) 479 return; 480 space = atomic_fetchadd_long(&runningbufspace, -bspace); 481 KASSERT(space >= bspace, ("runningbufspace underflow %ld %ld", 482 space, bspace)); 483 bp->b_runningbufspace = 0; 484 /* 485 * Only acquire the lock and wakeup on the transition from exceeding 486 * the threshold to falling below it. 487 */ 488 if (space < lorunningspace) 489 return; 490 if (space - bspace > lorunningspace) 491 return; 492 runningwakeup(); 493 } 494 495 /* 496 * bufcountadd: 497 * 498 * Called when a buffer has been added to one of the free queues to 499 * account for the buffer and to wakeup anyone waiting for free buffers. 500 * This typically occurs when large amounts of metadata are being handled 501 * by the buffer cache ( else buffer space runs out first, usually ). 502 */ 503 static __inline void 504 bufcountadd(struct buf *bp) 505 { 506 int old; 507 508 KASSERT((bp->b_flags & B_INFREECNT) == 0, 509 ("buf %p already counted as free", bp)); 510 bp->b_flags |= B_INFREECNT; 511 old = atomic_fetchadd_int(&numfreebuffers, 1); 512 KASSERT(old >= 0 && old < nbuf, 513 ("numfreebuffers climbed to %d", old + 1)); 514 mtx_lock(&nblock); 515 if (needsbuffer) { 516 needsbuffer &= ~VFS_BIO_NEED_ANY; 517 if (numfreebuffers >= hifreebuffers) 518 needsbuffer &= ~VFS_BIO_NEED_FREE; 519 wakeup(&needsbuffer); 520 } 521 mtx_unlock(&nblock); 522 } 523 524 /* 525 * bufcountsub: 526 * 527 * Decrement the numfreebuffers count as needed. 528 */ 529 static void 530 bufcountsub(struct buf *bp) 531 { 532 int old; 533 534 /* 535 * Fixup numfreebuffers count. If the buffer is invalid or not 536 * delayed-write, the buffer was free and we must decrement 537 * numfreebuffers. 538 */ 539 if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) { 540 KASSERT((bp->b_flags & B_INFREECNT) != 0, 541 ("buf %p not counted in numfreebuffers", bp)); 542 bp->b_flags &= ~B_INFREECNT; 543 old = atomic_fetchadd_int(&numfreebuffers, -1); 544 KASSERT(old > 0, ("numfreebuffers dropped to %d", old - 1)); 545 } 546 } 547 548 /* 549 * waitrunningbufspace() 550 * 551 * runningbufspace is a measure of the amount of I/O currently 552 * running. This routine is used in async-write situations to 553 * prevent creating huge backups of pending writes to a device. 554 * Only asynchronous writes are governed by this function. 555 * 556 * This does NOT turn an async write into a sync write. It waits 557 * for earlier writes to complete and generally returns before the 558 * caller's write has reached the device. 559 */ 560 void 561 waitrunningbufspace(void) 562 { 563 564 mtx_lock(&rbreqlock); 565 while (runningbufspace > hirunningspace) { 566 runningbufreq = 1; 567 msleep(&runningbufreq, &rbreqlock, PVM, "wdrain", 0); 568 } 569 mtx_unlock(&rbreqlock); 570 } 571 572 573 /* 574 * vfs_buf_test_cache: 575 * 576 * Called when a buffer is extended. This function clears the B_CACHE 577 * bit if the newly extended portion of the buffer does not contain 578 * valid data. 
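 *
 * A rough example of the intent, with sizes assumed for illustration:
 * if a 4096-byte buffer backed by a fully valid page is grown to 8192
 * bytes and the page range covering the new 4096..8191 bytes is not
 * fully valid, B_CACHE is cleared so that a later bread() performs real
 * I/O instead of treating the extended region as cached.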
579 */ 580 static __inline 581 void 582 vfs_buf_test_cache(struct buf *bp, 583 vm_ooffset_t foff, vm_offset_t off, vm_offset_t size, 584 vm_page_t m) 585 { 586 587 VM_OBJECT_ASSERT_WLOCKED(m->object); 588 if (bp->b_flags & B_CACHE) { 589 int base = (foff + off) & PAGE_MASK; 590 if (vm_page_is_valid(m, base, size) == 0) 591 bp->b_flags &= ~B_CACHE; 592 } 593 } 594 595 /* Wake up the buffer daemon if necessary */ 596 static __inline void 597 bd_wakeup(void) 598 { 599 600 mtx_lock(&bdlock); 601 if (bd_request == 0) { 602 bd_request = 1; 603 wakeup(&bd_request); 604 } 605 mtx_unlock(&bdlock); 606 } 607 608 /* 609 * bd_speedup - speedup the buffer cache flushing code 610 */ 611 void 612 bd_speedup(void) 613 { 614 int needwake; 615 616 mtx_lock(&bdlock); 617 needwake = 0; 618 if (bd_speedupreq == 0 || bd_request == 0) 619 needwake = 1; 620 bd_speedupreq = 1; 621 bd_request = 1; 622 if (needwake) 623 wakeup(&bd_request); 624 mtx_unlock(&bdlock); 625 } 626 627 #ifdef __i386__ 628 #define TRANSIENT_DENOM 5 629 #else 630 #define TRANSIENT_DENOM 10 631 #endif 632 633 /* 634 * Calculating buffer cache scaling values and reserve space for buffer 635 * headers. This is called during low level kernel initialization and 636 * may be called more than once. We CANNOT write to the memory area 637 * being reserved at this time. 638 */ 639 caddr_t 640 kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est) 641 { 642 int tuned_nbuf; 643 long maxbuf, maxbuf_sz, buf_sz, biotmap_sz; 644 645 /* 646 * physmem_est is in pages. Convert it to kilobytes (assumes 647 * PAGE_SIZE is >= 1K) 648 */ 649 physmem_est = physmem_est * (PAGE_SIZE / 1024); 650 651 /* 652 * The nominal buffer size (and minimum KVA allocation) is BKVASIZE. 653 * For the first 64MB of ram nominally allocate sufficient buffers to 654 * cover 1/4 of our ram. Beyond the first 64MB allocate additional 655 * buffers to cover 1/10 of our ram over 64MB. When auto-sizing 656 * the buffer cache we limit the eventual kva reservation to 657 * maxbcache bytes. 658 * 659 * factor represents the 1/4 x ram conversion. 660 */ 661 if (nbuf == 0) { 662 int factor = 4 * BKVASIZE / 1024; 663 664 nbuf = 50; 665 if (physmem_est > 4096) 666 nbuf += min((physmem_est - 4096) / factor, 667 65536 / factor); 668 if (physmem_est > 65536) 669 nbuf += min((physmem_est - 65536) * 2 / (factor * 5), 670 32 * 1024 * 1024 / (factor * 5)); 671 672 if (maxbcache && nbuf > maxbcache / BKVASIZE) 673 nbuf = maxbcache / BKVASIZE; 674 tuned_nbuf = 1; 675 } else 676 tuned_nbuf = 0; 677 678 /* XXX Avoid unsigned long overflows later on with maxbufspace. */ 679 maxbuf = (LONG_MAX / 3) / BKVASIZE; 680 if (nbuf > maxbuf) { 681 if (!tuned_nbuf) 682 printf("Warning: nbufs lowered from %d to %ld\n", nbuf, 683 maxbuf); 684 nbuf = maxbuf; 685 } 686 687 /* 688 * Ideal allocation size for the transient bio submap is 10% 689 * of the maximal space buffer map. This roughly corresponds 690 * to the amount of the buffer mapped for typical UFS load. 691 * 692 * Clip the buffer map to reserve space for the transient 693 * BIOs, if its extent is bigger than 90% (80% on i386) of the 694 * maximum buffer map extent on the platform. 695 * 696 * The fall-back to the maxbuf in case of maxbcache unset, 697 * allows us to not trim the buffer KVA for the architectures 698 * with ample KVA space. 699 */ 700 if (bio_transient_maxcnt == 0 && unmapped_buf_allowed) { 701 maxbuf_sz = maxbcache != 0 ?
maxbcache : maxbuf * BKVASIZE; 702 buf_sz = (long)nbuf * BKVASIZE; 703 if (buf_sz < maxbuf_sz / TRANSIENT_DENOM * 704 (TRANSIENT_DENOM - 1)) { 705 /* 706 * There is more KVA than memory. Do not 707 * adjust buffer map size, and assign the rest 708 * of maxbuf to transient map. 709 */ 710 biotmap_sz = maxbuf_sz - buf_sz; 711 } else { 712 /* 713 * Buffer map spans all KVA we could afford on 714 * this platform. Give 10% (20% on i386) of 715 * the buffer map to the transient bio map. 716 */ 717 biotmap_sz = buf_sz / TRANSIENT_DENOM; 718 buf_sz -= biotmap_sz; 719 } 720 if (biotmap_sz / INT_MAX > MAXPHYS) 721 bio_transient_maxcnt = INT_MAX; 722 else 723 bio_transient_maxcnt = biotmap_sz / MAXPHYS; 724 /* 725 * Artificially limit to 1024 simultaneous in-flight I/Os 726 * using the transient mapping. 727 */ 728 if (bio_transient_maxcnt > 1024) 729 bio_transient_maxcnt = 1024; 730 if (tuned_nbuf) 731 nbuf = buf_sz / BKVASIZE; 732 } 733 734 /* 735 * swbufs are used as temporary holders for I/O, such as paging I/O. 736 * We have no less than 16 and no more than 256. 737 */ 738 nswbuf = max(min(nbuf/4, 256), 16); 739 #ifdef NSWBUF_MIN 740 if (nswbuf < NSWBUF_MIN) 741 nswbuf = NSWBUF_MIN; 742 #endif 743 #ifdef DIRECTIO 744 ffs_rawread_setup(); 745 #endif 746 747 /* 748 * Reserve space for the buffer cache buffers 749 */ 750 swbuf = (void *)v; 751 v = (caddr_t)(swbuf + nswbuf); 752 buf = (void *)v; 753 v = (caddr_t)(buf + nbuf); 754 755 return(v); 756 } 757 758 /* Initialize the buffer subsystem. Called before use of any buffers. */ 759 void 760 bufinit(void) 761 { 762 struct buf *bp; 763 int i; 764 765 mtx_init(&bqclean, "bufq clean lock", NULL, MTX_DEF); 766 mtx_init(&bqdirty, "bufq dirty lock", NULL, MTX_DEF); 767 mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF); 768 mtx_init(&nblock, "needsbuffer lock", NULL, MTX_DEF); 769 mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF); 770 mtx_init(&bdirtylock, "dirty buf lock", NULL, MTX_DEF); 771 772 /* next, make a null set of free lists */ 773 for (i = 0; i < BUFFER_QUEUES; i++) 774 TAILQ_INIT(&bufqueues[i]); 775 776 /* finally, initialize each buffer header and stick on empty q */ 777 for (i = 0; i < nbuf; i++) { 778 bp = &buf[i]; 779 bzero(bp, sizeof *bp); 780 bp->b_flags = B_INVAL | B_INFREECNT; 781 bp->b_rcred = NOCRED; 782 bp->b_wcred = NOCRED; 783 bp->b_qindex = QUEUE_EMPTY; 784 bp->b_xflags = 0; 785 LIST_INIT(&bp->b_dep); 786 BUF_LOCKINIT(bp); 787 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist); 788 #ifdef INVARIANTS 789 bq_len[QUEUE_EMPTY]++; 790 #endif 791 } 792 793 /* 794 * maxbufspace is the absolute maximum amount of buffer space we are 795 * allowed to reserve in KVM and in real terms. The absolute maximum 796 * is nominally used by buf_daemon. hibufspace is the nominal maximum 797 * used by most other processes. The differential is required to 798 * ensure that buf_daemon is able to run when other processes might 799 * be blocked waiting for buffer space. 800 * 801 * maxbufspace is based on BKVASIZE. Allocating buffers larger than 802 * this may result in KVM fragmentation which is not handled optimally 803 * by the system. 804 */ 805 maxbufspace = (long)nbuf * BKVASIZE; 806 hibufspace = lmax(3 * maxbufspace / 4, maxbufspace - MAXBSIZE * 10); 807 lobufspace = hibufspace - MAXBSIZE; 808 809 /* 810 * Note: The 16 MiB upper limit for hirunningspace was chosen 811 * arbitrarily and may need further tuning.
It corresponds to 812 * 128 outstanding write IO requests (if IO size is 128 KiB), 813 * which fits with many RAID controllers' tagged queuing limits. 814 * The lower 1 MiB limit is the historical upper limit for 815 * hirunningspace. 816 */ 817 hirunningspace = lmax(lmin(roundup(hibufspace / 64, MAXBSIZE), 818 16 * 1024 * 1024), 1024 * 1024); 819 lorunningspace = roundup((hirunningspace * 2) / 3, MAXBSIZE); 820 821 /* 822 * Limit the amount of malloc memory since it is wired permanently into 823 * the kernel space. Even though this is accounted for in the buffer 824 * allocation, we don't want the malloced region to grow uncontrolled. 825 * The malloc scheme improves memory utilization significantly on average 826 * (small) directories. 827 */ 828 maxbufmallocspace = hibufspace / 20; 829 830 /* 831 * Reduce the chance of a deadlock occurring by limiting the number 832 * of delayed-write dirty buffers we allow to stack up. 833 */ 834 hidirtybuffers = nbuf / 4 + 20; 835 dirtybufthresh = hidirtybuffers * 9 / 10; 836 numdirtybuffers = 0; 837 /* 838 * To support extreme low-memory systems, make sure hidirtybuffers cannot 839 * eat up all available buffer space. This occurs when our minimum cannot 840 * be met. We try to size hidirtybuffers to 3/4 our buffer space assuming 841 * BKVASIZE'd buffers. 842 */ 843 while ((long)hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) { 844 hidirtybuffers >>= 1; 845 } 846 lodirtybuffers = hidirtybuffers / 2; 847 848 /* 849 * Try to keep the number of free buffers in the specified range, 850 * and give special processes (e.g. like buf_daemon) access to an 851 * emergency reserve. 852 */ 853 lofreebuffers = nbuf / 18 + 5; 854 hifreebuffers = 2 * lofreebuffers; 855 numfreebuffers = nbuf; 856 857 bogus_page = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ | 858 VM_ALLOC_NORMAL | VM_ALLOC_WIRED); 859 unmapped_buf = (caddr_t)kmem_alloc_nofault(kernel_map, MAXPHYS); 860 } 861 862 #ifdef INVARIANTS 863 static inline void 864 vfs_buf_check_mapped(struct buf *bp) 865 { 866 867 KASSERT((bp->b_flags & B_UNMAPPED) == 0, 868 ("mapped buf %p %x", bp, bp->b_flags)); 869 KASSERT(bp->b_kvabase != unmapped_buf, 870 ("mapped buf: b_kvabase was not updated %p", bp)); 871 KASSERT(bp->b_data != unmapped_buf, 872 ("mapped buf: b_data was not updated %p", bp)); 873 } 874 875 static inline void 876 vfs_buf_check_unmapped(struct buf *bp) 877 { 878 879 KASSERT((bp->b_flags & B_UNMAPPED) == B_UNMAPPED, 880 ("unmapped buf %p %x", bp, bp->b_flags)); 881 KASSERT(bp->b_kvabase == unmapped_buf, 882 ("unmapped buf: corrupted b_kvabase %p", bp)); 883 KASSERT(bp->b_data == unmapped_buf, 884 ("unmapped buf: corrupted b_data %p", bp)); 885 } 886 887 #define BUF_CHECK_MAPPED(bp) vfs_buf_check_mapped(bp) 888 #define BUF_CHECK_UNMAPPED(bp) vfs_buf_check_unmapped(bp) 889 #else 890 #define BUF_CHECK_MAPPED(bp) do {} while (0) 891 #define BUF_CHECK_UNMAPPED(bp) do {} while (0) 892 #endif 893 894 static void 895 bpmap_qenter(struct buf *bp) 896 { 897 898 BUF_CHECK_MAPPED(bp); 899 900 /* 901 * bp->b_data is relative to bp->b_offset, but 902 * bp->b_offset may be offset into the first page. 903 */ 904 bp->b_data = (caddr_t)trunc_page((vm_offset_t)bp->b_data); 905 pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages); 906 bp->b_data = (caddr_t)((vm_offset_t)bp->b_data | 907 (vm_offset_t)(bp->b_offset & PAGE_MASK)); 908 } 909 910 /* 911 * bfreekva() - free the kva allocation for a buffer. 912 * 913 * Since this call frees up buffer space, we call bufspacewakeup().
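 *
 * The expected interaction, sketched here as an assumption rather than a
 * contract: once the KVA and the bufspace accounting are released here,
 * a thread sleeping in getnewbuf() on VFS_BIO_NEED_BUFSPACE can be woken
 * by bufspacewakeup() and retry its allocation.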
914 */ 915 static void 916 bfreekva(struct buf *bp) 917 { 918 919 if (bp->b_kvasize == 0) 920 return; 921 922 atomic_add_int(&buffreekvacnt, 1); 923 atomic_subtract_long(&bufspace, bp->b_kvasize); 924 if ((bp->b_flags & B_UNMAPPED) == 0) { 925 BUF_CHECK_MAPPED(bp); 926 vmem_free(buffer_arena, (vm_offset_t)bp->b_kvabase, 927 bp->b_kvasize); 928 } else { 929 BUF_CHECK_UNMAPPED(bp); 930 if ((bp->b_flags & B_KVAALLOC) != 0) { 931 vmem_free(buffer_arena, (vm_offset_t)bp->b_kvaalloc, 932 bp->b_kvasize); 933 } 934 atomic_subtract_long(&unmapped_bufspace, bp->b_kvasize); 935 bp->b_flags &= ~(B_UNMAPPED | B_KVAALLOC); 936 } 937 bp->b_kvasize = 0; 938 bufspacewakeup(); 939 } 940 941 /* 942 * binsfree: 943 * 944 * Insert the buffer into the appropriate free list. 945 */ 946 static void 947 binsfree(struct buf *bp, int qindex) 948 { 949 struct mtx *olock, *nlock; 950 951 BUF_ASSERT_XLOCKED(bp); 952 953 olock = bqlock(bp->b_qindex); 954 nlock = bqlock(qindex); 955 mtx_lock(olock); 956 /* Handle delayed bremfree() processing. */ 957 if (bp->b_flags & B_REMFREE) 958 bremfreel(bp); 959 960 if (bp->b_qindex != QUEUE_NONE) 961 panic("binsfree: free buffer onto another queue???"); 962 963 bp->b_qindex = qindex; 964 if (olock != nlock) { 965 mtx_unlock(olock); 966 mtx_lock(nlock); 967 } 968 if (bp->b_flags & B_AGE) 969 TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist); 970 else 971 TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist); 972 #ifdef INVARIANTS 973 bq_len[bp->b_qindex]++; 974 #endif 975 mtx_unlock(nlock); 976 977 /* 978 * Something we can maybe free or reuse. 979 */ 980 if (bp->b_bufsize && !(bp->b_flags & B_DELWRI)) 981 bufspacewakeup(); 982 983 if ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI)) 984 bufcountadd(bp); 985 } 986 987 /* 988 * bremfree: 989 * 990 * Mark the buffer for removal from the appropriate free list. 991 * 992 */ 993 void 994 bremfree(struct buf *bp) 995 { 996 997 CTR3(KTR_BUF, "bremfree(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); 998 KASSERT((bp->b_flags & B_REMFREE) == 0, 999 ("bremfree: buffer %p already marked for delayed removal.", bp)); 1000 KASSERT(bp->b_qindex != QUEUE_NONE, 1001 ("bremfree: buffer %p not on a queue.", bp)); 1002 BUF_ASSERT_XLOCKED(bp); 1003 1004 bp->b_flags |= B_REMFREE; 1005 bufcountsub(bp); 1006 } 1007 1008 /* 1009 * bremfreef: 1010 * 1011 * Force an immediate removal from a free list. Used only in nfs when 1012 * it abuses the b_freelist pointer. 1013 */ 1014 void 1015 bremfreef(struct buf *bp) 1016 { 1017 struct mtx *qlock; 1018 1019 qlock = bqlock(bp->b_qindex); 1020 mtx_lock(qlock); 1021 bremfreel(bp); 1022 mtx_unlock(qlock); 1023 } 1024 1025 /* 1026 * bremfreel: 1027 * 1028 * Removes a buffer from the free list, must be called with the 1029 * correct qlock held. 1030 */ 1031 static void 1032 bremfreel(struct buf *bp) 1033 { 1034 1035 CTR3(KTR_BUF, "bremfreel(%p) vp %p flags %X", 1036 bp, bp->b_vp, bp->b_flags); 1037 KASSERT(bp->b_qindex != QUEUE_NONE, 1038 ("bremfreel: buffer %p not on a queue.", bp)); 1039 BUF_ASSERT_XLOCKED(bp); 1040 mtx_assert(bqlock(bp->b_qindex), MA_OWNED); 1041 1042 TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist); 1043 #ifdef INVARIANTS 1044 KASSERT(bq_len[bp->b_qindex] >= 1, ("queue %d underflow", 1045 bp->b_qindex)); 1046 bq_len[bp->b_qindex]--; 1047 #endif 1048 bp->b_qindex = QUEUE_NONE; 1049 /* 1050 * If this was a delayed bremfree() we only need to remove the buffer 1051 * from the queue and return the stats are already done. 
1052 */ 1053 if (bp->b_flags & B_REMFREE) { 1054 bp->b_flags &= ~B_REMFREE; 1055 return; 1056 } 1057 bufcountsub(bp); 1058 } 1059 1060 /* 1061 * Attempt to initiate asynchronous I/O on read-ahead blocks. We must 1062 * clear BIO_ERROR and B_INVAL prior to initiating I/O . If B_CACHE is set, 1063 * the buffer is valid and we do not have to do anything. 1064 */ 1065 void 1066 breada(struct vnode * vp, daddr_t * rablkno, int * rabsize, 1067 int cnt, struct ucred * cred) 1068 { 1069 struct buf *rabp; 1070 int i; 1071 1072 for (i = 0; i < cnt; i++, rablkno++, rabsize++) { 1073 if (inmem(vp, *rablkno)) 1074 continue; 1075 rabp = getblk(vp, *rablkno, *rabsize, 0, 0, 0); 1076 1077 if ((rabp->b_flags & B_CACHE) == 0) { 1078 if (!TD_IS_IDLETHREAD(curthread)) 1079 curthread->td_ru.ru_inblock++; 1080 rabp->b_flags |= B_ASYNC; 1081 rabp->b_flags &= ~B_INVAL; 1082 rabp->b_ioflags &= ~BIO_ERROR; 1083 rabp->b_iocmd = BIO_READ; 1084 if (rabp->b_rcred == NOCRED && cred != NOCRED) 1085 rabp->b_rcred = crhold(cred); 1086 vfs_busy_pages(rabp, 0); 1087 BUF_KERNPROC(rabp); 1088 rabp->b_iooffset = dbtob(rabp->b_blkno); 1089 bstrategy(rabp); 1090 } else { 1091 brelse(rabp); 1092 } 1093 } 1094 } 1095 1096 /* 1097 * Entry point for bread() and breadn() via #defines in sys/buf.h. 1098 * 1099 * Get a buffer with the specified data. Look in the cache first. We 1100 * must clear BIO_ERROR and B_INVAL prior to initiating I/O. If B_CACHE 1101 * is set, the buffer is valid and we do not have to do anything, see 1102 * getblk(). Also starts asynchronous I/O on read-ahead blocks. 1103 */ 1104 int 1105 breadn_flags(struct vnode *vp, daddr_t blkno, int size, daddr_t *rablkno, 1106 int *rabsize, int cnt, struct ucred *cred, int flags, struct buf **bpp) 1107 { 1108 struct buf *bp; 1109 int rv = 0, readwait = 0; 1110 1111 CTR3(KTR_BUF, "breadn(%p, %jd, %d)", vp, blkno, size); 1112 /* 1113 * Can only return NULL if GB_LOCK_NOWAIT flag is specified. 1114 */ 1115 *bpp = bp = getblk(vp, blkno, size, 0, 0, flags); 1116 if (bp == NULL) 1117 return (EBUSY); 1118 1119 /* if not found in cache, do some I/O */ 1120 if ((bp->b_flags & B_CACHE) == 0) { 1121 if (!TD_IS_IDLETHREAD(curthread)) 1122 curthread->td_ru.ru_inblock++; 1123 bp->b_iocmd = BIO_READ; 1124 bp->b_flags &= ~B_INVAL; 1125 bp->b_ioflags &= ~BIO_ERROR; 1126 if (bp->b_rcred == NOCRED && cred != NOCRED) 1127 bp->b_rcred = crhold(cred); 1128 vfs_busy_pages(bp, 0); 1129 bp->b_iooffset = dbtob(bp->b_blkno); 1130 bstrategy(bp); 1131 ++readwait; 1132 } 1133 1134 breada(vp, rablkno, rabsize, cnt, cred); 1135 1136 if (readwait) { 1137 rv = bufwait(bp); 1138 } 1139 return (rv); 1140 } 1141 1142 /* 1143 * Write, release buffer on completion. (Done by iodone 1144 * if async). Do not bother writing anything if the buffer 1145 * is invalid. 1146 * 1147 * Note that we set B_CACHE here, indicating that buffer is 1148 * fully valid and thus cacheable. This is true even of NFS 1149 * now so we set it generally. This could be set either here 1150 * or in biodone() since the I/O is synchronous. We put it 1151 * here. 
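 *
 * A minimal usage sketch (assumed caller code, not taken from this file):
 *
 *	bp = getblk(vp, lbn, size, 0, 0, 0);
 *	... fill bp->b_data ...
 *	error = bwrite(bp);	synchronous: waits in bufwait() and
 *				releases the buffer via brelse()
 *
 * The asynchronous variant, bawrite(), sets B_ASYNC first and lets the
 * I/O completion path release the buffer instead.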
1152 */ 1153 int 1154 bufwrite(struct buf *bp) 1155 { 1156 int oldflags; 1157 struct vnode *vp; 1158 long space; 1159 int vp_md; 1160 1161 CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); 1162 if (bp->b_flags & B_INVAL) { 1163 brelse(bp); 1164 return (0); 1165 } 1166 1167 if (bp->b_flags & B_BARRIER) 1168 barrierwrites++; 1169 1170 oldflags = bp->b_flags; 1171 1172 BUF_ASSERT_HELD(bp); 1173 1174 if (bp->b_pin_count > 0) 1175 bunpin_wait(bp); 1176 1177 KASSERT(!(bp->b_vflags & BV_BKGRDINPROG), 1178 ("FFS background buffer should not get here %p", bp)); 1179 1180 vp = bp->b_vp; 1181 if (vp) 1182 vp_md = vp->v_vflag & VV_MD; 1183 else 1184 vp_md = 0; 1185 1186 /* 1187 * Mark the buffer clean. Increment the bufobj write count 1188 * before bundirty() call, to prevent other thread from seeing 1189 * empty dirty list and zero counter for writes in progress, 1190 * falsely indicating that the bufobj is clean. 1191 */ 1192 bufobj_wref(bp->b_bufobj); 1193 bundirty(bp); 1194 1195 bp->b_flags &= ~B_DONE; 1196 bp->b_ioflags &= ~BIO_ERROR; 1197 bp->b_flags |= B_CACHE; 1198 bp->b_iocmd = BIO_WRITE; 1199 1200 vfs_busy_pages(bp, 1); 1201 1202 /* 1203 * Normal bwrites pipeline writes 1204 */ 1205 bp->b_runningbufspace = bp->b_bufsize; 1206 space = atomic_fetchadd_long(&runningbufspace, bp->b_runningbufspace); 1207 1208 if (!TD_IS_IDLETHREAD(curthread)) 1209 curthread->td_ru.ru_oublock++; 1210 if (oldflags & B_ASYNC) 1211 BUF_KERNPROC(bp); 1212 bp->b_iooffset = dbtob(bp->b_blkno); 1213 bstrategy(bp); 1214 1215 if ((oldflags & B_ASYNC) == 0) { 1216 int rtval = bufwait(bp); 1217 brelse(bp); 1218 return (rtval); 1219 } else if (space > hirunningspace) { 1220 /* 1221 * don't allow the async write to saturate the I/O 1222 * system. We will not deadlock here because 1223 * we are blocking waiting for I/O that is already in-progress 1224 * to complete. We do not block here if it is the update 1225 * or syncer daemon trying to clean up as that can lead 1226 * to deadlock. 1227 */ 1228 if ((curthread->td_pflags & TDP_NORUNNINGBUF) == 0 && !vp_md) 1229 waitrunningbufspace(); 1230 } 1231 1232 return (0); 1233 } 1234 1235 void 1236 bufbdflush(struct bufobj *bo, struct buf *bp) 1237 { 1238 struct buf *nbp; 1239 1240 if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10) { 1241 (void) VOP_FSYNC(bp->b_vp, MNT_NOWAIT, curthread); 1242 altbufferflushes++; 1243 } else if (bo->bo_dirty.bv_cnt > dirtybufthresh) { 1244 BO_LOCK(bo); 1245 /* 1246 * Try to find a buffer to flush. 1247 */ 1248 TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) { 1249 if ((nbp->b_vflags & BV_BKGRDINPROG) || 1250 BUF_LOCK(nbp, 1251 LK_EXCLUSIVE | LK_NOWAIT, NULL)) 1252 continue; 1253 if (bp == nbp) 1254 panic("bdwrite: found ourselves"); 1255 BO_UNLOCK(bo); 1256 /* Don't countdeps with the bo lock held. */ 1257 if (buf_countdeps(nbp, 0)) { 1258 BO_LOCK(bo); 1259 BUF_UNLOCK(nbp); 1260 continue; 1261 } 1262 if (nbp->b_flags & B_CLUSTEROK) { 1263 vfs_bio_awrite(nbp); 1264 } else { 1265 bremfree(nbp); 1266 bawrite(nbp); 1267 } 1268 dirtybufferflushes++; 1269 break; 1270 } 1271 if (nbp == NULL) 1272 BO_UNLOCK(bo); 1273 } 1274 } 1275 1276 /* 1277 * Delayed write. (Buffer is marked dirty). Do not bother writing 1278 * anything if the buffer is marked invalid. 1279 * 1280 * Note that since the buffer must be completely valid, we can safely 1281 * set B_CACHE. In fact, we have to set B_CACHE here rather then in 1282 * biodone() in order to prevent getblk from writing the buffer 1283 * out synchronously. 
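 *
 * A minimal delayed-write sketch (assumed caller code, for illustration):
 *
 *	bp = getblk(vp, lbn, size, 0, 0, 0);
 *	... modify bp->b_data ...
 *	bdwrite(bp);		marks the buffer dirty and requeues it;
 *				the data reaches the disk later via the
 *				buf daemon, a sync, or clustering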
1284 */ 1285 void 1286 bdwrite(struct buf *bp) 1287 { 1288 struct thread *td = curthread; 1289 struct vnode *vp; 1290 struct bufobj *bo; 1291 1292 CTR3(KTR_BUF, "bdwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); 1293 KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); 1294 KASSERT((bp->b_flags & B_BARRIER) == 0, 1295 ("Barrier request in delayed write %p", bp)); 1296 BUF_ASSERT_HELD(bp); 1297 1298 if (bp->b_flags & B_INVAL) { 1299 brelse(bp); 1300 return; 1301 } 1302 1303 /* 1304 * If we have too many dirty buffers, don't create any more. 1305 * If we are wildly over our limit, then force a complete 1306 * cleanup. Otherwise, just keep the situation from getting 1307 * out of control. Note that we have to avoid a recursive 1308 * disaster and not try to clean up after our own cleanup! 1309 */ 1310 vp = bp->b_vp; 1311 bo = bp->b_bufobj; 1312 if ((td->td_pflags & (TDP_COWINPROGRESS|TDP_INBDFLUSH)) == 0) { 1313 td->td_pflags |= TDP_INBDFLUSH; 1314 BO_BDFLUSH(bo, bp); 1315 td->td_pflags &= ~TDP_INBDFLUSH; 1316 } else 1317 recursiveflushes++; 1318 1319 bdirty(bp); 1320 /* 1321 * Set B_CACHE, indicating that the buffer is fully valid. This is 1322 * true even of NFS now. 1323 */ 1324 bp->b_flags |= B_CACHE; 1325 1326 /* 1327 * This bmap keeps the system from needing to do the bmap later, 1328 * perhaps when the system is attempting to do a sync. Since it 1329 * is likely that the indirect block -- or whatever other datastructure 1330 * that the filesystem needs is still in memory now, it is a good 1331 * thing to do this. Note also, that if the pageout daemon is 1332 * requesting a sync -- there might not be enough memory to do 1333 * the bmap then... So, this is important to do. 1334 */ 1335 if (vp->v_type != VCHR && bp->b_lblkno == bp->b_blkno) { 1336 VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL); 1337 } 1338 1339 /* 1340 * Set the *dirty* buffer range based upon the VM system dirty 1341 * pages. 1342 * 1343 * Mark the buffer pages as clean. We need to do this here to 1344 * satisfy the vnode_pager and the pageout daemon, so that it 1345 * thinks that the pages have been "cleaned". Note that since 1346 * the pages are in a delayed write buffer -- the VFS layer 1347 * "will" see that the pages get written out on the next sync, 1348 * or perhaps the cluster will be completed. 1349 */ 1350 vfs_clean_pages_dirty_buf(bp); 1351 bqrelse(bp); 1352 1353 /* 1354 * note: we cannot initiate I/O from a bdwrite even if we wanted to, 1355 * due to the softdep code. 1356 */ 1357 } 1358 1359 /* 1360 * bdirty: 1361 * 1362 * Turn buffer into delayed write request. We must clear BIO_READ and 1363 * B_RELBUF, and we must set B_DELWRI. We reassign the buffer to 1364 * itself to properly update it in the dirty/clean lists. We mark it 1365 * B_DONE to ensure that any asynchronization of the buffer properly 1366 * clears B_DONE ( else a panic will occur later ). 1367 * 1368 * bdirty() is kinda like bdwrite() - we have to clear B_INVAL which 1369 * might have been set pre-getblk(). Unlike bwrite/bdwrite, bdirty() 1370 * should only be called if the buffer is known-good. 1371 * 1372 * Since the buffer is not on a queue, we do not update the numfreebuffers 1373 * count. 1374 * 1375 * The buffer must be on QUEUE_NONE. 
1376 */ 1377 void 1378 bdirty(struct buf *bp) 1379 { 1380 1381 CTR3(KTR_BUF, "bdirty(%p) vp %p flags %X", 1382 bp, bp->b_vp, bp->b_flags); 1383 KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); 1384 KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE, 1385 ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex)); 1386 BUF_ASSERT_HELD(bp); 1387 bp->b_flags &= ~(B_RELBUF); 1388 bp->b_iocmd = BIO_WRITE; 1389 1390 if ((bp->b_flags & B_DELWRI) == 0) { 1391 bp->b_flags |= /* XXX B_DONE | */ B_DELWRI; 1392 reassignbuf(bp); 1393 bdirtyadd(); 1394 } 1395 } 1396 1397 /* 1398 * bundirty: 1399 * 1400 * Clear B_DELWRI for buffer. 1401 * 1402 * Since the buffer is not on a queue, we do not update the numfreebuffers 1403 * count. 1404 * 1405 * The buffer must be on QUEUE_NONE. 1406 */ 1407 1408 void 1409 bundirty(struct buf *bp) 1410 { 1411 1412 CTR3(KTR_BUF, "bundirty(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); 1413 KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); 1414 KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE, 1415 ("bundirty: buffer %p still on queue %d", bp, bp->b_qindex)); 1416 BUF_ASSERT_HELD(bp); 1417 1418 if (bp->b_flags & B_DELWRI) { 1419 bp->b_flags &= ~B_DELWRI; 1420 reassignbuf(bp); 1421 bdirtysub(); 1422 } 1423 /* 1424 * Since it is now being written, we can clear its deferred write flag. 1425 */ 1426 bp->b_flags &= ~B_DEFERRED; 1427 } 1428 1429 /* 1430 * bawrite: 1431 * 1432 * Asynchronous write. Start output on a buffer, but do not wait for 1433 * it to complete. The buffer is released when the output completes. 1434 * 1435 * bwrite() ( or the VOP routine anyway ) is responsible for handling 1436 * B_INVAL buffers. Not us. 1437 */ 1438 void 1439 bawrite(struct buf *bp) 1440 { 1441 1442 bp->b_flags |= B_ASYNC; 1443 (void) bwrite(bp); 1444 } 1445 1446 /* 1447 * babarrierwrite: 1448 * 1449 * Asynchronous barrier write. Start output on a buffer, but do not 1450 * wait for it to complete. Place a write barrier after this write so 1451 * that this buffer and all buffers written before it are committed to 1452 * the disk before any buffers written after this write are committed 1453 * to the disk. The buffer is released when the output completes. 1454 */ 1455 void 1456 babarrierwrite(struct buf *bp) 1457 { 1458 1459 bp->b_flags |= B_ASYNC | B_BARRIER; 1460 (void) bwrite(bp); 1461 } 1462 1463 /* 1464 * bbarrierwrite: 1465 * 1466 * Synchronous barrier write. Start output on a buffer and wait for 1467 * it to complete. Place a write barrier after this write so that 1468 * this buffer and all buffers written before it are committed to 1469 * the disk before any buffers written after this write are committed 1470 * to the disk. The buffer is released when the output completes. 1471 */ 1472 int 1473 bbarrierwrite(struct buf *bp) 1474 { 1475 1476 bp->b_flags |= B_BARRIER; 1477 return (bwrite(bp)); 1478 } 1479 1480 /* 1481 * bwillwrite: 1482 * 1483 * Called prior to the locking of any vnodes when we are expecting to 1484 * write. We do not want to starve the buffer cache with too many 1485 * dirty buffers so we block here. By blocking prior to the locking 1486 * of any vnodes we attempt to avoid the situation where a locked vnode 1487 * prevents the various system daemons from flushing related buffers. 
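 *
 * The intended call pattern is roughly the following sketch (caller code
 * assumed for illustration, not taken from this file):
 *
 *	bwillwrite();		throttle before any vnode is locked
 *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 *	error = VOP_WRITE(vp, uio, ioflag, cred);
 *	VOP_UNLOCK(vp, 0);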
1488 */ 1489 void 1490 bwillwrite(void) 1491 { 1492 1493 if (numdirtybuffers >= hidirtybuffers) { 1494 mtx_lock(&bdirtylock); 1495 while (numdirtybuffers >= hidirtybuffers) { 1496 bdirtywait = 1; 1497 msleep(&bdirtywait, &bdirtylock, (PRIBIO + 4), 1498 "flswai", 0); 1499 } 1500 mtx_unlock(&bdirtylock); 1501 } 1502 } 1503 1504 /* 1505 * Return true if we have too many dirty buffers. 1506 */ 1507 int 1508 buf_dirty_count_severe(void) 1509 { 1510 1511 return(numdirtybuffers >= hidirtybuffers); 1512 } 1513 1514 static __noinline int 1515 buf_vm_page_count_severe(void) 1516 { 1517 1518 KFAIL_POINT_CODE(DEBUG_FP, buf_pressure, return 1); 1519 1520 return vm_page_count_severe(); 1521 } 1522 1523 /* 1524 * brelse: 1525 * 1526 * Release a busy buffer and, if requested, free its resources. The 1527 * buffer will be stashed in the appropriate bufqueue[] allowing it 1528 * to be accessed later as a cache entity or reused for other purposes. 1529 */ 1530 void 1531 brelse(struct buf *bp) 1532 { 1533 int qindex; 1534 1535 CTR3(KTR_BUF, "brelse(%p) vp %p flags %X", 1536 bp, bp->b_vp, bp->b_flags); 1537 KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), 1538 ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); 1539 1540 if (BUF_LOCKRECURSED(bp)) { 1541 /* 1542 * Do not process, in particular, do not handle the 1543 * B_INVAL/B_RELBUF and do not release to free list. 1544 */ 1545 BUF_UNLOCK(bp); 1546 return; 1547 } 1548 1549 if (bp->b_flags & B_MANAGED) { 1550 bqrelse(bp); 1551 return; 1552 } 1553 1554 if (bp->b_iocmd == BIO_WRITE && (bp->b_ioflags & BIO_ERROR) && 1555 bp->b_error == EIO && !(bp->b_flags & B_INVAL)) { 1556 /* 1557 * Failed write, redirty. Must clear BIO_ERROR to prevent 1558 * pages from being scrapped. If the error is anything 1559 * other than an I/O error (EIO), assume that retrying 1560 * is futile. 1561 */ 1562 bp->b_ioflags &= ~BIO_ERROR; 1563 bdirty(bp); 1564 } else if ((bp->b_flags & (B_NOCACHE | B_INVAL)) || 1565 (bp->b_ioflags & BIO_ERROR) || (bp->b_bufsize <= 0)) { 1566 /* 1567 * Either a failed I/O or we were asked to free or not 1568 * cache the buffer. 1569 */ 1570 bp->b_flags |= B_INVAL; 1571 if (!LIST_EMPTY(&bp->b_dep)) 1572 buf_deallocate(bp); 1573 if (bp->b_flags & B_DELWRI) 1574 bdirtysub(); 1575 bp->b_flags &= ~(B_DELWRI | B_CACHE); 1576 if ((bp->b_flags & B_VMIO) == 0) { 1577 if (bp->b_bufsize) 1578 allocbuf(bp, 0); 1579 if (bp->b_vp) 1580 brelvp(bp); 1581 } 1582 } 1583 1584 /* 1585 * We must clear B_RELBUF if B_DELWRI is set. If vfs_vmio_release() 1586 * is called with B_DELWRI set, the underlying pages may wind up 1587 * getting freed causing a previous write (bdwrite()) to get 'lost' 1588 * because pages associated with a B_DELWRI bp are marked clean. 1589 * 1590 * We still allow the B_INVAL case to call vfs_vmio_release(), even 1591 * if B_DELWRI is set. 1592 * 1593 * If B_DELWRI is not set we may have to set B_RELBUF if we are low 1594 * on pages to return pages to the VM page queues. 1595 */ 1596 if (bp->b_flags & B_DELWRI) 1597 bp->b_flags &= ~B_RELBUF; 1598 else if (buf_vm_page_count_severe()) { 1599 /* 1600 * BKGRDINPROG can only be set with the buf and bufobj 1601 * locks both held. We tolerate a race to clear it here. 1602 */ 1603 if (!(bp->b_vflags & BV_BKGRDINPROG)) 1604 bp->b_flags |= B_RELBUF; 1605 } 1606 1607 /* 1608 * VMIO buffer rundown. It is not very necessary to keep a VMIO buffer 1609 * constituted, not even NFS buffers now. Two flags effect this. If 1610 * B_INVAL, the struct buf is invalidated but the VM object is kept 1611 * around ( i.e. 
so it is trivial to reconstitute the buffer later ). 1612 * 1613 * If BIO_ERROR or B_NOCACHE is set, pages in the VM object will be 1614 * invalidated. BIO_ERROR cannot be set for a failed write unless the 1615 * buffer is also B_INVAL because it hits the re-dirtying code above. 1616 * 1617 * Normally we can do this whether a buffer is B_DELWRI or not. If 1618 * the buffer is an NFS buffer, it is tracking piecemeal writes or 1619 * the commit state and we cannot afford to lose the buffer. If the 1620 * buffer has a background write in progress, we need to keep it 1621 * around to prevent it from being reconstituted and starting a second 1622 * background write. 1623 */ 1624 if ((bp->b_flags & B_VMIO) 1625 && !(bp->b_vp->v_mount != NULL && 1626 (bp->b_vp->v_mount->mnt_vfc->vfc_flags & VFCF_NETWORK) != 0 && 1627 !vn_isdisk(bp->b_vp, NULL) && 1628 (bp->b_flags & B_DELWRI)) 1629 ) { 1630 1631 int i, j, resid; 1632 vm_page_t m; 1633 off_t foff; 1634 vm_pindex_t poff; 1635 vm_object_t obj; 1636 1637 obj = bp->b_bufobj->bo_object; 1638 1639 /* 1640 * Get the base offset and length of the buffer. Note that 1641 * in the VMIO case if the buffer block size is not 1642 * page-aligned then b_data pointer may not be page-aligned. 1643 * But our b_pages[] array *IS* page aligned. 1644 * 1645 * block sizes less than DEV_BSIZE (usually 512) are not 1646 * supported due to the page granularity bits (m->valid, 1647 * m->dirty, etc...). 1648 * 1649 * See man buf(9) for more information 1650 */ 1651 resid = bp->b_bufsize; 1652 foff = bp->b_offset; 1653 for (i = 0; i < bp->b_npages; i++) { 1654 int had_bogus = 0; 1655 1656 m = bp->b_pages[i]; 1657 1658 /* 1659 * If we hit a bogus page, fixup *all* the bogus pages 1660 * now. 1661 */ 1662 if (m == bogus_page) { 1663 poff = OFF_TO_IDX(bp->b_offset); 1664 had_bogus = 1; 1665 1666 VM_OBJECT_RLOCK(obj); 1667 for (j = i; j < bp->b_npages; j++) { 1668 vm_page_t mtmp; 1669 mtmp = bp->b_pages[j]; 1670 if (mtmp == bogus_page) { 1671 mtmp = vm_page_lookup(obj, poff + j); 1672 if (!mtmp) { 1673 panic("brelse: page missing\n"); 1674 } 1675 bp->b_pages[j] = mtmp; 1676 } 1677 } 1678 VM_OBJECT_RUNLOCK(obj); 1679 1680 if ((bp->b_flags & (B_INVAL | B_UNMAPPED)) == 0) { 1681 BUF_CHECK_MAPPED(bp); 1682 pmap_qenter( 1683 trunc_page((vm_offset_t)bp->b_data), 1684 bp->b_pages, bp->b_npages); 1685 } 1686 m = bp->b_pages[i]; 1687 } 1688 if ((bp->b_flags & B_NOCACHE) || 1689 (bp->b_ioflags & BIO_ERROR && 1690 bp->b_iocmd == BIO_READ)) { 1691 int poffset = foff & PAGE_MASK; 1692 int presid = resid > (PAGE_SIZE - poffset) ? 1693 (PAGE_SIZE - poffset) : resid; 1694 1695 KASSERT(presid >= 0, ("brelse: extra page")); 1696 VM_OBJECT_WLOCK(obj); 1697 if (pmap_page_wired_mappings(m) == 0) 1698 vm_page_set_invalid(m, poffset, presid); 1699 VM_OBJECT_WUNLOCK(obj); 1700 if (had_bogus) 1701 printf("avoided corruption bug in bogus_page/brelse code\n"); 1702 } 1703 resid -= PAGE_SIZE - (foff & PAGE_MASK); 1704 foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; 1705 } 1706 if (bp->b_flags & (B_INVAL | B_RELBUF)) 1707 vfs_vmio_release(bp); 1708 1709 } else if (bp->b_flags & B_VMIO) { 1710 1711 if (bp->b_flags & (B_INVAL | B_RELBUF)) { 1712 vfs_vmio_release(bp); 1713 } 1714 1715 } else if ((bp->b_flags & (B_INVAL | B_RELBUF)) != 0) { 1716 if (bp->b_bufsize != 0) 1717 allocbuf(bp, 0); 1718 if (bp->b_vp != NULL) 1719 brelvp(bp); 1720 } 1721 1722 /* 1723 * If the buffer has junk contents signal it and eventually 1724 * clean up B_DELWRI and disassociate the vnode so that gbincore() 1725 * doesn't find it.
1726 */ 1727 if (bp->b_bufsize == 0 || (bp->b_ioflags & BIO_ERROR) != 0 || 1728 (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF)) != 0) 1729 bp->b_flags |= B_INVAL; 1730 if (bp->b_flags & B_INVAL) { 1731 if (bp->b_flags & B_DELWRI) 1732 bundirty(bp); 1733 if (bp->b_vp) 1734 brelvp(bp); 1735 } 1736 1737 /* buffers with no memory */ 1738 if (bp->b_bufsize == 0) { 1739 bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA); 1740 if (bp->b_vflags & BV_BKGRDINPROG) 1741 panic("losing buffer 1"); 1742 if (bp->b_kvasize) 1743 qindex = QUEUE_EMPTYKVA; 1744 else 1745 qindex = QUEUE_EMPTY; 1746 bp->b_flags |= B_AGE; 1747 /* buffers with junk contents */ 1748 } else if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) || 1749 (bp->b_ioflags & BIO_ERROR)) { 1750 bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA); 1751 if (bp->b_vflags & BV_BKGRDINPROG) 1752 panic("losing buffer 2"); 1753 qindex = QUEUE_CLEAN; 1754 bp->b_flags |= B_AGE; 1755 /* remaining buffers */ 1756 } else if (bp->b_flags & B_DELWRI) 1757 qindex = QUEUE_DIRTY; 1758 else 1759 qindex = QUEUE_CLEAN; 1760 1761 binsfree(bp, qindex); 1762 1763 bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF | B_DIRECT); 1764 if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY)) 1765 panic("brelse: not dirty"); 1766 /* unlock */ 1767 BUF_UNLOCK(bp); 1768 } 1769 1770 /* 1771 * Release a buffer back to the appropriate queue but do not try to free 1772 * it. The buffer is expected to be used again soon. 1773 * 1774 * bqrelse() is used by bdwrite() to requeue a delayed write, and used by 1775 * biodone() to requeue an async I/O on completion. It is also used when 1776 * known good buffers need to be requeued but we think we may need the data 1777 * again soon. 1778 * 1779 * XXX we should be able to leave the B_RELBUF hint set on completion. 1780 */ 1781 void 1782 bqrelse(struct buf *bp) 1783 { 1784 int qindex; 1785 1786 CTR3(KTR_BUF, "bqrelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); 1787 KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), 1788 ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); 1789 1790 if (BUF_LOCKRECURSED(bp)) { 1791 /* do not release to free list */ 1792 BUF_UNLOCK(bp); 1793 return; 1794 } 1795 bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF); 1796 1797 if (bp->b_flags & B_MANAGED) { 1798 if (bp->b_flags & B_REMFREE) 1799 bremfreef(bp); 1800 goto out; 1801 } 1802 1803 /* buffers with stale but valid contents */ 1804 if (bp->b_flags & B_DELWRI) { 1805 qindex = QUEUE_DIRTY; 1806 } else { 1807 if ((bp->b_flags & B_DELWRI) == 0 && 1808 (bp->b_xflags & BX_VNDIRTY)) 1809 panic("bqrelse: not dirty"); 1810 /* 1811 * BKGRDINPROG can only be set with the buf and bufobj 1812 * locks both held. We tolerate a race to clear it here. 1813 */ 1814 if (buf_vm_page_count_severe() && 1815 (bp->b_vflags & BV_BKGRDINPROG) == 0) { 1816 /* 1817 * We are too low on memory, we have to try to free 1818 * the buffer (most importantly: the wired pages 1819 * making up its backing store) *now*. 
1820 */ 1821 brelse(bp); 1822 return; 1823 } 1824 qindex = QUEUE_CLEAN; 1825 } 1826 binsfree(bp, qindex); 1827 1828 out: 1829 /* unlock */ 1830 BUF_UNLOCK(bp); 1831 } 1832 1833 /* Give pages used by the bp back to the VM system (where possible) */ 1834 static void 1835 vfs_vmio_release(struct buf *bp) 1836 { 1837 int i; 1838 vm_page_t m; 1839 1840 if ((bp->b_flags & B_UNMAPPED) == 0) { 1841 BUF_CHECK_MAPPED(bp); 1842 pmap_qremove(trunc_page((vm_offset_t)bp->b_data), bp->b_npages); 1843 } else 1844 BUF_CHECK_UNMAPPED(bp); 1845 VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); 1846 for (i = 0; i < bp->b_npages; i++) { 1847 m = bp->b_pages[i]; 1848 bp->b_pages[i] = NULL; 1849 /* 1850 * In order to keep page LRU ordering consistent, put 1851 * everything on the inactive queue. 1852 */ 1853 vm_page_lock(m); 1854 vm_page_unwire(m, 0); 1855 /* 1856 * We don't mess with busy pages, it is 1857 * the responsibility of the process that 1858 * busied the pages to deal with them. 1859 */ 1860 if ((m->oflags & VPO_BUSY) == 0 && m->busy == 0 && 1861 m->wire_count == 0) { 1862 /* 1863 * Might as well free the page if we can and it has 1864 * no valid data. We also free the page if the 1865 * buffer was used for direct I/O 1866 */ 1867 if ((bp->b_flags & B_ASYNC) == 0 && !m->valid) { 1868 vm_page_free(m); 1869 } else if (bp->b_flags & B_DIRECT) { 1870 vm_page_try_to_free(m); 1871 } else if (buf_vm_page_count_severe()) { 1872 vm_page_try_to_cache(m); 1873 } 1874 } 1875 vm_page_unlock(m); 1876 } 1877 VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); 1878 1879 if (bp->b_bufsize) { 1880 bufspacewakeup(); 1881 bp->b_bufsize = 0; 1882 } 1883 bp->b_npages = 0; 1884 bp->b_flags &= ~B_VMIO; 1885 if (bp->b_vp) 1886 brelvp(bp); 1887 } 1888 1889 /* 1890 * Check to see if a block at a particular lbn is available for a clustered 1891 * write. 1892 */ 1893 static int 1894 vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno) 1895 { 1896 struct buf *bpa; 1897 int match; 1898 1899 match = 0; 1900 1901 /* If the buf isn't in core skip it */ 1902 if ((bpa = gbincore(&vp->v_bufobj, lblkno)) == NULL) 1903 return (0); 1904 1905 /* If the buf is busy we don't want to wait for it */ 1906 if (BUF_LOCK(bpa, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) 1907 return (0); 1908 1909 /* Only cluster with valid clusterable delayed write buffers */ 1910 if ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) != 1911 (B_DELWRI | B_CLUSTEROK)) 1912 goto done; 1913 1914 if (bpa->b_bufsize != size) 1915 goto done; 1916 1917 /* 1918 * Check to see if it is in the expected place on disk and that the 1919 * block has been mapped. 1920 */ 1921 if ((bpa->b_blkno != bpa->b_lblkno) && (bpa->b_blkno == blkno)) 1922 match = 1; 1923 done: 1924 BUF_UNLOCK(bpa); 1925 return (match); 1926 } 1927 1928 /* 1929 * vfs_bio_awrite: 1930 * 1931 * Implement clustered async writes for clearing out B_DELWRI buffers. 1932 * This is much better then the old way of writing only one buffer at 1933 * a time. Note that we may not be presented with the buffers in the 1934 * correct order, so we search for the cluster in both directions. 1935 */ 1936 int 1937 vfs_bio_awrite(struct buf *bp) 1938 { 1939 struct bufobj *bo; 1940 int i; 1941 int j; 1942 daddr_t lblkno = bp->b_lblkno; 1943 struct vnode *vp = bp->b_vp; 1944 int ncl; 1945 int nwritten; 1946 int size; 1947 int maxcl; 1948 int gbflags; 1949 1950 bo = &vp->v_bufobj; 1951 gbflags = (bp->b_flags & B_UNMAPPED) != 0 ? GB_UNMAPPED : 0; 1952 /* 1953 * right now we support clustered writing only to regular files. 
If 1954 * we find a clusterable block we could be in the middle of a cluster 1955 * rather then at the beginning. 1956 */ 1957 if ((vp->v_type == VREG) && 1958 (vp->v_mount != 0) && /* Only on nodes that have the size info */ 1959 (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) { 1960 1961 size = vp->v_mount->mnt_stat.f_iosize; 1962 maxcl = MAXPHYS / size; 1963 1964 BO_RLOCK(bo); 1965 for (i = 1; i < maxcl; i++) 1966 if (vfs_bio_clcheck(vp, size, lblkno + i, 1967 bp->b_blkno + ((i * size) >> DEV_BSHIFT)) == 0) 1968 break; 1969 1970 for (j = 1; i + j <= maxcl && j <= lblkno; j++) 1971 if (vfs_bio_clcheck(vp, size, lblkno - j, 1972 bp->b_blkno - ((j * size) >> DEV_BSHIFT)) == 0) 1973 break; 1974 BO_RUNLOCK(bo); 1975 --j; 1976 ncl = i + j; 1977 /* 1978 * this is a possible cluster write 1979 */ 1980 if (ncl != 1) { 1981 BUF_UNLOCK(bp); 1982 nwritten = cluster_wbuild(vp, size, lblkno - j, ncl, 1983 gbflags); 1984 return (nwritten); 1985 } 1986 } 1987 bremfree(bp); 1988 bp->b_flags |= B_ASYNC; 1989 /* 1990 * default (old) behavior, writing out only one block 1991 * 1992 * XXX returns b_bufsize instead of b_bcount for nwritten? 1993 */ 1994 nwritten = bp->b_bufsize; 1995 (void) bwrite(bp); 1996 1997 return (nwritten); 1998 } 1999 2000 static void 2001 setbufkva(struct buf *bp, vm_offset_t addr, int maxsize, int gbflags) 2002 { 2003 2004 KASSERT((bp->b_flags & (B_UNMAPPED | B_KVAALLOC)) == 0 && 2005 bp->b_kvasize == 0, ("call bfreekva(%p)", bp)); 2006 if ((gbflags & GB_UNMAPPED) == 0) { 2007 bp->b_kvabase = (caddr_t)addr; 2008 } else if ((gbflags & GB_KVAALLOC) != 0) { 2009 KASSERT((gbflags & GB_UNMAPPED) != 0, 2010 ("GB_KVAALLOC without GB_UNMAPPED")); 2011 bp->b_kvaalloc = (caddr_t)addr; 2012 bp->b_flags |= B_UNMAPPED | B_KVAALLOC; 2013 atomic_add_long(&unmapped_bufspace, bp->b_kvasize); 2014 } 2015 bp->b_kvasize = maxsize; 2016 } 2017 2018 /* 2019 * Allocate the buffer KVA and set b_kvasize. Also set b_kvabase if 2020 * needed. 2021 */ 2022 static int 2023 allocbufkva(struct buf *bp, int maxsize, int gbflags) 2024 { 2025 vm_offset_t addr; 2026 2027 bfreekva(bp); 2028 addr = 0; 2029 2030 if (vmem_alloc(buffer_arena, maxsize, M_BESTFIT | M_NOWAIT, &addr)) { 2031 /* 2032 * Buffer map is too fragmented. Request the caller 2033 * to defragment the map. 2034 */ 2035 atomic_add_int(&bufdefragcnt, 1); 2036 return (1); 2037 } 2038 setbufkva(bp, addr, maxsize, gbflags); 2039 atomic_add_long(&bufspace, bp->b_kvasize); 2040 return (0); 2041 } 2042 2043 /* 2044 * Ask the bufdaemon for help, or act as bufdaemon itself, when a 2045 * locked vnode is supplied. 
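 * Called with bqclean held; the lock is dropped before sleeping and is
 * not reacquired before returning.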
2046 */ 2047 static void 2048 getnewbuf_bufd_help(struct vnode *vp, int gbflags, int slpflag, int slptimeo, 2049 int defrag) 2050 { 2051 struct thread *td; 2052 char *waitmsg; 2053 int fl, flags, norunbuf; 2054 2055 mtx_assert(&bqclean, MA_OWNED); 2056 2057 if (defrag) { 2058 flags = VFS_BIO_NEED_BUFSPACE; 2059 waitmsg = "nbufkv"; 2060 } else if (bufspace >= hibufspace) { 2061 waitmsg = "nbufbs"; 2062 flags = VFS_BIO_NEED_BUFSPACE; 2063 } else { 2064 waitmsg = "newbuf"; 2065 flags = VFS_BIO_NEED_ANY; 2066 } 2067 mtx_lock(&nblock); 2068 needsbuffer |= flags; 2069 mtx_unlock(&nblock); 2070 mtx_unlock(&bqclean); 2071 2072 bd_speedup(); /* heeeelp */ 2073 if ((gbflags & GB_NOWAIT_BD) != 0) 2074 return; 2075 2076 td = curthread; 2077 mtx_lock(&nblock); 2078 while (needsbuffer & flags) { 2079 if (vp != NULL && (td->td_pflags & TDP_BUFNEED) == 0) { 2080 mtx_unlock(&nblock); 2081 /* 2082 * getblk() is called with a vnode locked, and 2083 * some majority of the dirty buffers may as 2084 * well belong to the vnode. Flushing the 2085 * buffers there would make a progress that 2086 * cannot be achieved by the buf_daemon, that 2087 * cannot lock the vnode. 2088 */ 2089 norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) | 2090 (td->td_pflags & TDP_NORUNNINGBUF); 2091 /* play bufdaemon */ 2092 td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF; 2093 fl = buf_flush(vp, flushbufqtarget); 2094 td->td_pflags &= norunbuf; 2095 mtx_lock(&nblock); 2096 if (fl != 0) 2097 continue; 2098 if ((needsbuffer & flags) == 0) 2099 break; 2100 } 2101 if (msleep(&needsbuffer, &nblock, (PRIBIO + 4) | slpflag, 2102 waitmsg, slptimeo)) 2103 break; 2104 } 2105 mtx_unlock(&nblock); 2106 } 2107 2108 static void 2109 getnewbuf_reuse_bp(struct buf *bp, int qindex) 2110 { 2111 2112 CTR6(KTR_BUF, "getnewbuf(%p) vp %p flags %X kvasize %d bufsize %d " 2113 "queue %d (recycling)", bp, bp->b_vp, bp->b_flags, 2114 bp->b_kvasize, bp->b_bufsize, qindex); 2115 mtx_assert(&bqclean, MA_NOTOWNED); 2116 2117 /* 2118 * Note: we no longer distinguish between VMIO and non-VMIO 2119 * buffers. 2120 */ 2121 KASSERT((bp->b_flags & B_DELWRI) == 0, 2122 ("delwri buffer %p found in queue %d", bp, qindex)); 2123 2124 if (qindex == QUEUE_CLEAN) { 2125 if (bp->b_flags & B_VMIO) { 2126 bp->b_flags &= ~B_ASYNC; 2127 vfs_vmio_release(bp); 2128 } 2129 if (bp->b_vp != NULL) 2130 brelvp(bp); 2131 } 2132 2133 /* 2134 * Get the rest of the buffer freed up. b_kva* is still valid 2135 * after this operation. 2136 */ 2137 2138 if (bp->b_rcred != NOCRED) { 2139 crfree(bp->b_rcred); 2140 bp->b_rcred = NOCRED; 2141 } 2142 if (bp->b_wcred != NOCRED) { 2143 crfree(bp->b_wcred); 2144 bp->b_wcred = NOCRED; 2145 } 2146 if (!LIST_EMPTY(&bp->b_dep)) 2147 buf_deallocate(bp); 2148 if (bp->b_vflags & BV_BKGRDINPROG) 2149 panic("losing buffer 3"); 2150 KASSERT(bp->b_vp == NULL, ("bp: %p still has vnode %p. qindex: %d", 2151 bp, bp->b_vp, qindex)); 2152 KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0, 2153 ("bp: %p still on a buffer list. 
xflags %X", bp, bp->b_xflags)); 2154 2155 if (bp->b_bufsize) 2156 allocbuf(bp, 0); 2157 2158 bp->b_flags &= B_UNMAPPED | B_KVAALLOC; 2159 bp->b_ioflags = 0; 2160 bp->b_xflags = 0; 2161 KASSERT((bp->b_flags & B_INFREECNT) == 0, 2162 ("buf %p still counted as free?", bp)); 2163 bp->b_vflags = 0; 2164 bp->b_vp = NULL; 2165 bp->b_blkno = bp->b_lblkno = 0; 2166 bp->b_offset = NOOFFSET; 2167 bp->b_iodone = 0; 2168 bp->b_error = 0; 2169 bp->b_resid = 0; 2170 bp->b_bcount = 0; 2171 bp->b_npages = 0; 2172 bp->b_dirtyoff = bp->b_dirtyend = 0; 2173 bp->b_bufobj = NULL; 2174 bp->b_pin_count = 0; 2175 bp->b_fsprivate1 = NULL; 2176 bp->b_fsprivate2 = NULL; 2177 bp->b_fsprivate3 = NULL; 2178 2179 LIST_INIT(&bp->b_dep); 2180 } 2181 2182 static int flushingbufs; 2183 2184 static struct buf * 2185 getnewbuf_scan(int maxsize, int defrag, int unmapped, int metadata) 2186 { 2187 struct buf *bp, *nbp; 2188 int nqindex, qindex, pass; 2189 2190 KASSERT(!unmapped || !defrag, ("both unmapped and defrag")); 2191 2192 pass = 1; 2193 restart: 2194 atomic_add_int(&getnewbufrestarts, 1); 2195 2196 /* 2197 * Setup for scan. If we do not have enough free buffers, 2198 * we setup a degenerate case that immediately fails. Note 2199 * that if we are specially marked process, we are allowed to 2200 * dip into our reserves. 2201 * 2202 * The scanning sequence is nominally: EMPTY->EMPTYKVA->CLEAN 2203 * for the allocation of the mapped buffer. For unmapped, the 2204 * easiest is to start with EMPTY outright. 2205 * 2206 * We start with EMPTYKVA. If the list is empty we backup to EMPTY. 2207 * However, there are a number of cases (defragging, reusing, ...) 2208 * where we cannot backup. 2209 */ 2210 nbp = NULL; 2211 mtx_lock(&bqclean); 2212 if (!defrag && unmapped) { 2213 nqindex = QUEUE_EMPTY; 2214 nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]); 2215 } 2216 if (nbp == NULL) { 2217 nqindex = QUEUE_EMPTYKVA; 2218 nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]); 2219 } 2220 2221 /* 2222 * If no EMPTYKVA buffers and we are either defragging or 2223 * reusing, locate a CLEAN buffer to free or reuse. If 2224 * bufspace useage is low skip this step so we can allocate a 2225 * new buffer. 2226 */ 2227 if (nbp == NULL && (defrag || bufspace >= lobufspace)) { 2228 nqindex = QUEUE_CLEAN; 2229 nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]); 2230 } 2231 2232 /* 2233 * If we could not find or were not allowed to reuse a CLEAN 2234 * buffer, check to see if it is ok to use an EMPTY buffer. 2235 * We can only use an EMPTY buffer if allocating its KVA would 2236 * not otherwise run us out of buffer space. No KVA is needed 2237 * for the unmapped allocation. 2238 */ 2239 if (nbp == NULL && defrag == 0 && (bufspace + maxsize < hibufspace || 2240 metadata)) { 2241 nqindex = QUEUE_EMPTY; 2242 nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]); 2243 } 2244 2245 /* 2246 * All available buffers might be clean, retry ignoring the 2247 * lobufspace as the last resort. 2248 */ 2249 if (nbp == NULL && !TAILQ_EMPTY(&bufqueues[QUEUE_CLEAN])) { 2250 nqindex = QUEUE_CLEAN; 2251 nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]); 2252 } 2253 2254 /* 2255 * Run scan, possibly freeing data and/or kva mappings on the fly 2256 * depending. 2257 */ 2258 while ((bp = nbp) != NULL) { 2259 qindex = nqindex; 2260 2261 /* 2262 * Calculate next bp (we can only use it if we do not 2263 * block or do other fancy things). 
2264 */ 2265 if ((nbp = TAILQ_NEXT(bp, b_freelist)) == NULL) { 2266 switch (qindex) { 2267 case QUEUE_EMPTY: 2268 nqindex = QUEUE_EMPTYKVA; 2269 nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]); 2270 if (nbp != NULL) 2271 break; 2272 /* FALLTHROUGH */ 2273 case QUEUE_EMPTYKVA: 2274 nqindex = QUEUE_CLEAN; 2275 nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]); 2276 if (nbp != NULL) 2277 break; 2278 /* FALLTHROUGH */ 2279 case QUEUE_CLEAN: 2280 if (metadata && pass == 1) { 2281 pass = 2; 2282 nqindex = QUEUE_EMPTY; 2283 nbp = TAILQ_FIRST( 2284 &bufqueues[QUEUE_EMPTY]); 2285 } 2286 /* 2287 * nbp is NULL. 2288 */ 2289 break; 2290 } 2291 } 2292 /* 2293 * If we are defragging then we need a buffer with 2294 * b_kvasize != 0. XXX this situation should no longer 2295 * occur, if defrag is non-zero the buffer's b_kvasize 2296 * should also be non-zero at this point. XXX 2297 */ 2298 if (defrag && bp->b_kvasize == 0) { 2299 printf("Warning: defrag empty buffer %p\n", bp); 2300 continue; 2301 } 2302 2303 /* 2304 * Start freeing the bp. This is somewhat involved. nbp 2305 * remains valid only for QUEUE_EMPTY[KVA] bp's. 2306 */ 2307 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) 2308 continue; 2309 /* 2310 * BKGRDINPROG can only be set with the buf and bufobj 2311 * locks both held. We tolerate a race to clear it here. 2312 */ 2313 if (bp->b_vflags & BV_BKGRDINPROG) { 2314 BUF_UNLOCK(bp); 2315 continue; 2316 } 2317 2318 KASSERT(bp->b_qindex == qindex, 2319 ("getnewbuf: inconsistent queue %d bp %p", qindex, bp)); 2320 2321 bremfreel(bp); 2322 mtx_unlock(&bqclean); 2323 /* 2324 * NOTE: nbp is now entirely invalid. We can only restart 2325 * the scan from this point on. 2326 */ 2327 2328 getnewbuf_reuse_bp(bp, qindex); 2329 mtx_assert(&bqclean, MA_NOTOWNED); 2330 2331 /* 2332 * If we are defragging then free the buffer. 2333 */ 2334 if (defrag) { 2335 bp->b_flags |= B_INVAL; 2336 bfreekva(bp); 2337 brelse(bp); 2338 defrag = 0; 2339 goto restart; 2340 } 2341 2342 /* 2343 * Notify any waiters for the buffer lock about 2344 * identity change by freeing the buffer. 2345 */ 2346 if (qindex == QUEUE_CLEAN && BUF_LOCKWAITERS(bp)) { 2347 bp->b_flags |= B_INVAL; 2348 bfreekva(bp); 2349 brelse(bp); 2350 goto restart; 2351 } 2352 2353 if (metadata) 2354 break; 2355 2356 /* 2357 * If we are overcomitted then recover the buffer and its 2358 * KVM space. This occurs in rare situations when multiple 2359 * processes are blocked in getnewbuf() or allocbuf(). 2360 */ 2361 if (bufspace >= hibufspace) 2362 flushingbufs = 1; 2363 if (flushingbufs && bp->b_kvasize != 0) { 2364 bp->b_flags |= B_INVAL; 2365 bfreekva(bp); 2366 brelse(bp); 2367 goto restart; 2368 } 2369 if (bufspace < lobufspace) 2370 flushingbufs = 0; 2371 break; 2372 } 2373 return (bp); 2374 } 2375 2376 /* 2377 * getnewbuf: 2378 * 2379 * Find and initialize a new buffer header, freeing up existing buffers 2380 * in the bufqueues as necessary. The new buffer is returned locked. 2381 * 2382 * Important: B_INVAL is not set. If the caller wishes to throw the 2383 * buffer away, the caller must set B_INVAL prior to calling brelse(). 
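 * The buffer returned here carries no identity (no vnode and no block
 * number); it is up to the caller, e.g. getblk(), to initialize it.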
2384 * 2385 * We block if: 2386 * We have insufficient buffer headers 2387 * We have insufficient buffer space 2388 * buffer_arena is too fragmented ( space reservation fails ) 2389 * If we have to flush dirty buffers ( but we try to avoid this ) 2390 */ 2391 static struct buf * 2392 getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int size, int maxsize, 2393 int gbflags) 2394 { 2395 struct buf *bp; 2396 int defrag, metadata; 2397 2398 KASSERT((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC, 2399 ("GB_KVAALLOC only makes sense with GB_UNMAPPED")); 2400 if (!unmapped_buf_allowed) 2401 gbflags &= ~(GB_UNMAPPED | GB_KVAALLOC); 2402 2403 defrag = 0; 2404 if (vp == NULL || (vp->v_vflag & (VV_MD | VV_SYSTEM)) != 0 || 2405 vp->v_type == VCHR) 2406 metadata = 1; 2407 else 2408 metadata = 0; 2409 /* 2410 * We can't afford to block since we might be holding a vnode lock, 2411 * which may prevent system daemons from running. We deal with 2412 * low-memory situations by proactively returning memory and running 2413 * async I/O rather then sync I/O. 2414 */ 2415 atomic_add_int(&getnewbufcalls, 1); 2416 atomic_subtract_int(&getnewbufrestarts, 1); 2417 restart: 2418 bp = getnewbuf_scan(maxsize, defrag, (gbflags & (GB_UNMAPPED | 2419 GB_KVAALLOC)) == GB_UNMAPPED, metadata); 2420 if (bp != NULL) 2421 defrag = 0; 2422 2423 /* 2424 * If we exhausted our list, sleep as appropriate. We may have to 2425 * wakeup various daemons and write out some dirty buffers. 2426 * 2427 * Generally we are sleeping due to insufficient buffer space. 2428 */ 2429 if (bp == NULL) { 2430 mtx_assert(&bqclean, MA_OWNED); 2431 getnewbuf_bufd_help(vp, gbflags, slpflag, slptimeo, defrag); 2432 mtx_assert(&bqclean, MA_NOTOWNED); 2433 } else if ((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == GB_UNMAPPED) { 2434 mtx_assert(&bqclean, MA_NOTOWNED); 2435 2436 bfreekva(bp); 2437 bp->b_flags |= B_UNMAPPED; 2438 bp->b_kvabase = bp->b_data = unmapped_buf; 2439 bp->b_kvasize = maxsize; 2440 atomic_add_long(&bufspace, bp->b_kvasize); 2441 atomic_add_long(&unmapped_bufspace, bp->b_kvasize); 2442 atomic_add_int(&bufreusecnt, 1); 2443 } else { 2444 mtx_assert(&bqclean, MA_NOTOWNED); 2445 2446 /* 2447 * We finally have a valid bp. We aren't quite out of the 2448 * woods, we still have to reserve kva space. In order 2449 * to keep fragmentation sane we only allocate kva in 2450 * BKVASIZE chunks. 2451 */ 2452 maxsize = (maxsize + BKVAMASK) & ~BKVAMASK; 2453 2454 if (maxsize != bp->b_kvasize || (bp->b_flags & (B_UNMAPPED | 2455 B_KVAALLOC)) == B_UNMAPPED) { 2456 if (allocbufkva(bp, maxsize, gbflags)) { 2457 defrag = 1; 2458 bp->b_flags |= B_INVAL; 2459 brelse(bp); 2460 goto restart; 2461 } 2462 atomic_add_int(&bufreusecnt, 1); 2463 } else if ((bp->b_flags & B_KVAALLOC) != 0 && 2464 (gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == 0) { 2465 /* 2466 * If the reused buffer has KVA allocated, 2467 * reassign b_kvaalloc to b_kvabase. 2468 */ 2469 bp->b_kvabase = bp->b_kvaalloc; 2470 bp->b_flags &= ~B_KVAALLOC; 2471 atomic_subtract_long(&unmapped_bufspace, 2472 bp->b_kvasize); 2473 atomic_add_int(&bufreusecnt, 1); 2474 } else if ((bp->b_flags & (B_UNMAPPED | B_KVAALLOC)) == 0 && 2475 (gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == (GB_UNMAPPED | 2476 GB_KVAALLOC)) { 2477 /* 2478 * The case of reused buffer already have KVA 2479 * mapped, but the request is for unmapped 2480 * buffer with KVA allocated. 
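 * Park the existing mapping in b_kvaalloc and point b_data at
 * unmapped_buf, so the buffer is treated as unmapped from here on.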
2481 */ 2482 bp->b_kvaalloc = bp->b_kvabase; 2483 bp->b_data = bp->b_kvabase = unmapped_buf; 2484 bp->b_flags |= B_UNMAPPED | B_KVAALLOC; 2485 atomic_add_long(&unmapped_bufspace, 2486 bp->b_kvasize); 2487 atomic_add_int(&bufreusecnt, 1); 2488 } 2489 if ((gbflags & GB_UNMAPPED) == 0) { 2490 bp->b_saveaddr = bp->b_kvabase; 2491 bp->b_data = bp->b_saveaddr; 2492 bp->b_flags &= ~B_UNMAPPED; 2493 BUF_CHECK_MAPPED(bp); 2494 } 2495 } 2496 return (bp); 2497 } 2498 2499 /* 2500 * buf_daemon: 2501 * 2502 * buffer flushing daemon. Buffers are normally flushed by the 2503 * update daemon but if it cannot keep up this process starts to 2504 * take the load in an attempt to prevent getnewbuf() from blocking. 2505 */ 2506 2507 static struct kproc_desc buf_kp = { 2508 "bufdaemon", 2509 buf_daemon, 2510 &bufdaemonproc 2511 }; 2512 SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp); 2513 2514 static int 2515 buf_flush(struct vnode *vp, int target) 2516 { 2517 int flushed; 2518 2519 flushed = flushbufqueues(vp, target, 0); 2520 if (flushed == 0) { 2521 /* 2522 * Could not find any buffers without rollback 2523 * dependencies, so just write the first one 2524 * in the hopes of eventually making progress. 2525 */ 2526 if (vp != NULL && target > 2) 2527 target /= 2; 2528 flushbufqueues(vp, target, 1); 2529 } 2530 return (flushed); 2531 } 2532 2533 static void 2534 buf_daemon() 2535 { 2536 int lodirty; 2537 2538 /* 2539 * This process needs to be suspended prior to shutdown sync. 2540 */ 2541 EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, bufdaemonproc, 2542 SHUTDOWN_PRI_LAST); 2543 2544 /* 2545 * This process is allowed to take the buffer cache to the limit 2546 */ 2547 curthread->td_pflags |= TDP_NORUNNINGBUF | TDP_BUFNEED; 2548 mtx_lock(&bdlock); 2549 for (;;) { 2550 bd_request = 0; 2551 mtx_unlock(&bdlock); 2552 2553 kproc_suspend_check(bufdaemonproc); 2554 lodirty = lodirtybuffers; 2555 if (bd_speedupreq) { 2556 lodirty = numdirtybuffers / 2; 2557 bd_speedupreq = 0; 2558 } 2559 /* 2560 * Do the flush. Limit the amount of in-transit I/O we 2561 * allow to build up, otherwise we would completely saturate 2562 * the I/O system. 2563 */ 2564 while (numdirtybuffers > lodirty) { 2565 if (buf_flush(NULL, numdirtybuffers - lodirty) == 0) 2566 break; 2567 kern_yield(PRI_USER); 2568 } 2569 2570 /* 2571 * Only clear bd_request if we have reached our low water 2572 * mark. The buf_daemon normally waits 1 second and 2573 * then incrementally flushes any dirty buffers that have 2574 * built up, within reason. 2575 * 2576 * If we were unable to hit our low water mark and couldn't 2577 * find any flushable buffers, we sleep for a short period 2578 * to avoid endless loops on unlockable buffers. 2579 */ 2580 mtx_lock(&bdlock); 2581 if (numdirtybuffers <= lodirtybuffers) { 2582 /* 2583 * We reached our low water mark, reset the 2584 * request and sleep until we are needed again. 2585 * The sleep is just so the suspend code works. 2586 */ 2587 bd_request = 0; 2588 /* 2589 * Do an extra wakeup in case dirty threshold 2590 * changed via sysctl and the explicit transition 2591 * out of shortfall was missed. 2592 */ 2593 bdirtywakeup(); 2594 if (runningbufspace <= lorunningspace) 2595 runningwakeup(); 2596 msleep(&bd_request, &bdlock, PVM, "psleep", hz); 2597 } else { 2598 /* 2599 * We couldn't find any flushable dirty buffers but 2600 * still have too many dirty buffers, we 2601 * have to sleep and try again. 
(rare) 2602 */ 2603 msleep(&bd_request, &bdlock, PVM, "qsleep", hz / 10); 2604 } 2605 } 2606 } 2607 2608 /* 2609 * flushbufqueues: 2610 * 2611 * Try to flush a buffer in the dirty queue. We must be careful to 2612 * free up B_INVAL buffers instead of write them, which NFS is 2613 * particularly sensitive to. 2614 */ 2615 static int flushwithdeps = 0; 2616 SYSCTL_INT(_vfs, OID_AUTO, flushwithdeps, CTLFLAG_RW, &flushwithdeps, 2617 0, "Number of buffers flushed with dependecies that require rollbacks"); 2618 2619 static int 2620 flushbufqueues(struct vnode *lvp, int target, int flushdeps) 2621 { 2622 struct buf *sentinel; 2623 struct vnode *vp; 2624 struct mount *mp; 2625 struct buf *bp; 2626 int hasdeps; 2627 int flushed; 2628 int queue; 2629 2630 flushed = 0; 2631 queue = QUEUE_DIRTY; 2632 bp = NULL; 2633 sentinel = malloc(sizeof(struct buf), M_TEMP, M_WAITOK | M_ZERO); 2634 sentinel->b_qindex = QUEUE_SENTINEL; 2635 mtx_lock(&bqdirty); 2636 TAILQ_INSERT_HEAD(&bufqueues[queue], sentinel, b_freelist); 2637 while (flushed != target) { 2638 bp = TAILQ_NEXT(sentinel, b_freelist); 2639 if (bp != NULL) { 2640 TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist); 2641 TAILQ_INSERT_AFTER(&bufqueues[queue], bp, sentinel, 2642 b_freelist); 2643 } else 2644 break; 2645 /* 2646 * Skip sentinels inserted by other invocations of the 2647 * flushbufqueues(), taking care to not reorder them. 2648 */ 2649 if (bp->b_qindex == QUEUE_SENTINEL) 2650 continue; 2651 /* 2652 * Only flush the buffers that belong to the 2653 * vnode locked by the curthread. 2654 */ 2655 if (lvp != NULL && bp->b_vp != lvp) 2656 continue; 2657 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) 2658 continue; 2659 if (bp->b_pin_count > 0) { 2660 BUF_UNLOCK(bp); 2661 continue; 2662 } 2663 /* 2664 * BKGRDINPROG can only be set with the buf and bufobj 2665 * locks both held. We tolerate a race to clear it here. 2666 */ 2667 if ((bp->b_vflags & BV_BKGRDINPROG) != 0 || 2668 (bp->b_flags & B_DELWRI) == 0) { 2669 BUF_UNLOCK(bp); 2670 continue; 2671 } 2672 if (bp->b_flags & B_INVAL) { 2673 bremfreel(bp); 2674 mtx_unlock(&bqdirty); 2675 brelse(bp); 2676 flushed++; 2677 mtx_lock(&bqdirty); 2678 continue; 2679 } 2680 2681 if (!LIST_EMPTY(&bp->b_dep) && buf_countdeps(bp, 0)) { 2682 if (flushdeps == 0) { 2683 BUF_UNLOCK(bp); 2684 continue; 2685 } 2686 hasdeps = 1; 2687 } else 2688 hasdeps = 0; 2689 /* 2690 * We must hold the lock on a vnode before writing 2691 * one of its buffers. Otherwise we may confuse, or 2692 * in the case of a snapshot vnode, deadlock the 2693 * system. 2694 * 2695 * The lock order here is the reverse of the normal 2696 * of vnode followed by buf lock. This is ok because 2697 * the NOWAIT will prevent deadlock. 2698 */ 2699 vp = bp->b_vp; 2700 if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { 2701 BUF_UNLOCK(bp); 2702 continue; 2703 } 2704 if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_CANRECURSE) == 0) { 2705 mtx_unlock(&bqdirty); 2706 CTR3(KTR_BUF, "flushbufqueue(%p) vp %p flags %X", 2707 bp, bp->b_vp, bp->b_flags); 2708 if (curproc == bufdaemonproc) 2709 vfs_bio_awrite(bp); 2710 else { 2711 bremfree(bp); 2712 bwrite(bp); 2713 notbufdflushes++; 2714 } 2715 vn_finished_write(mp); 2716 VOP_UNLOCK(vp, 0); 2717 flushwithdeps += hasdeps; 2718 flushed++; 2719 2720 /* 2721 * Sleeping on runningbufspace while holding 2722 * vnode lock leads to deadlock. 
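 * The vnode lock was dropped above, so it is safe to throttle here.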
2723 */ 2724 if (curproc == bufdaemonproc && 2725 runningbufspace > hirunningspace) 2726 waitrunningbufspace(); 2727 mtx_lock(&bqdirty); 2728 continue; 2729 } 2730 vn_finished_write(mp); 2731 BUF_UNLOCK(bp); 2732 } 2733 TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist); 2734 mtx_unlock(&bqdirty); 2735 free(sentinel, M_TEMP); 2736 return (flushed); 2737 } 2738 2739 /* 2740 * Check to see if a block is currently memory resident. 2741 */ 2742 struct buf * 2743 incore(struct bufobj *bo, daddr_t blkno) 2744 { 2745 struct buf *bp; 2746 2747 BO_RLOCK(bo); 2748 bp = gbincore(bo, blkno); 2749 BO_RUNLOCK(bo); 2750 return (bp); 2751 } 2752 2753 /* 2754 * Returns true if no I/O is needed to access the 2755 * associated VM object. This is like incore except 2756 * it also hunts around in the VM system for the data. 2757 */ 2758 2759 static int 2760 inmem(struct vnode * vp, daddr_t blkno) 2761 { 2762 vm_object_t obj; 2763 vm_offset_t toff, tinc, size; 2764 vm_page_t m; 2765 vm_ooffset_t off; 2766 2767 ASSERT_VOP_LOCKED(vp, "inmem"); 2768 2769 if (incore(&vp->v_bufobj, blkno)) 2770 return 1; 2771 if (vp->v_mount == NULL) 2772 return 0; 2773 obj = vp->v_object; 2774 if (obj == NULL) 2775 return (0); 2776 2777 size = PAGE_SIZE; 2778 if (size > vp->v_mount->mnt_stat.f_iosize) 2779 size = vp->v_mount->mnt_stat.f_iosize; 2780 off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize; 2781 2782 VM_OBJECT_RLOCK(obj); 2783 for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) { 2784 m = vm_page_lookup(obj, OFF_TO_IDX(off + toff)); 2785 if (!m) 2786 goto notinmem; 2787 tinc = size; 2788 if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK)) 2789 tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK); 2790 if (vm_page_is_valid(m, 2791 (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0) 2792 goto notinmem; 2793 } 2794 VM_OBJECT_RUNLOCK(obj); 2795 return 1; 2796 2797 notinmem: 2798 VM_OBJECT_RUNLOCK(obj); 2799 return (0); 2800 } 2801 2802 /* 2803 * Set the dirty range for a buffer based on the status of the dirty 2804 * bits in the pages comprising the buffer. The range is limited 2805 * to the size of the buffer. 2806 * 2807 * Tell the VM system that the pages associated with this buffer 2808 * are clean. This is used for delayed writes where the data is 2809 * going to go to disk eventually without additional VM intevention. 2810 * 2811 * Note that while we only really need to clean through to b_bcount, we 2812 * just go ahead and clean through to b_bufsize. 
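 * The object write lock is taken and busy pages are drained before the
 * page dirty bits are inspected.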
2813 */ 2814 static void 2815 vfs_clean_pages_dirty_buf(struct buf *bp) 2816 { 2817 vm_ooffset_t foff, noff, eoff; 2818 vm_page_t m; 2819 int i; 2820 2821 if ((bp->b_flags & B_VMIO) == 0 || bp->b_bufsize == 0) 2822 return; 2823 2824 foff = bp->b_offset; 2825 KASSERT(bp->b_offset != NOOFFSET, 2826 ("vfs_clean_pages_dirty_buf: no buffer offset")); 2827 2828 VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); 2829 vfs_drain_busy_pages(bp); 2830 vfs_setdirty_locked_object(bp); 2831 for (i = 0; i < bp->b_npages; i++) { 2832 noff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; 2833 eoff = noff; 2834 if (eoff > bp->b_offset + bp->b_bufsize) 2835 eoff = bp->b_offset + bp->b_bufsize; 2836 m = bp->b_pages[i]; 2837 vfs_page_set_validclean(bp, foff, m); 2838 /* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */ 2839 foff = noff; 2840 } 2841 VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); 2842 } 2843 2844 static void 2845 vfs_setdirty_locked_object(struct buf *bp) 2846 { 2847 vm_object_t object; 2848 int i; 2849 2850 object = bp->b_bufobj->bo_object; 2851 VM_OBJECT_ASSERT_WLOCKED(object); 2852 2853 /* 2854 * We qualify the scan for modified pages on whether the 2855 * object has been flushed yet. 2856 */ 2857 if ((object->flags & OBJ_MIGHTBEDIRTY) != 0) { 2858 vm_offset_t boffset; 2859 vm_offset_t eoffset; 2860 2861 /* 2862 * test the pages to see if they have been modified directly 2863 * by users through the VM system. 2864 */ 2865 for (i = 0; i < bp->b_npages; i++) 2866 vm_page_test_dirty(bp->b_pages[i]); 2867 2868 /* 2869 * Calculate the encompassing dirty range, boffset and eoffset, 2870 * (eoffset - boffset) bytes. 2871 */ 2872 2873 for (i = 0; i < bp->b_npages; i++) { 2874 if (bp->b_pages[i]->dirty) 2875 break; 2876 } 2877 boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); 2878 2879 for (i = bp->b_npages - 1; i >= 0; --i) { 2880 if (bp->b_pages[i]->dirty) { 2881 break; 2882 } 2883 } 2884 eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); 2885 2886 /* 2887 * Fit it to the buffer. 2888 */ 2889 2890 if (eoffset > bp->b_bcount) 2891 eoffset = bp->b_bcount; 2892 2893 /* 2894 * If we have a good dirty range, merge with the existing 2895 * dirty range. 2896 */ 2897 2898 if (boffset < eoffset) { 2899 if (bp->b_dirtyoff > boffset) 2900 bp->b_dirtyoff = boffset; 2901 if (bp->b_dirtyend < eoffset) 2902 bp->b_dirtyend = eoffset; 2903 } 2904 } 2905 } 2906 2907 /* 2908 * Allocate the KVA mapping for an existing buffer. It handles the 2909 * cases of both B_UNMAPPED buffer, and buffer with the preallocated 2910 * KVA which is not mapped (B_KVAALLOC). 2911 */ 2912 static void 2913 bp_unmapped_get_kva(struct buf *bp, daddr_t blkno, int size, int gbflags) 2914 { 2915 struct buf *scratch_bp; 2916 int bsize, maxsize, need_mapping, need_kva; 2917 off_t offset; 2918 2919 need_mapping = (bp->b_flags & B_UNMAPPED) != 0 && 2920 (gbflags & GB_UNMAPPED) == 0; 2921 need_kva = (bp->b_flags & (B_KVAALLOC | B_UNMAPPED)) == B_UNMAPPED && 2922 (gbflags & GB_KVAALLOC) != 0; 2923 if (!need_mapping && !need_kva) 2924 return; 2925 2926 BUF_CHECK_UNMAPPED(bp); 2927 2928 if (need_mapping && (bp->b_flags & B_KVAALLOC) != 0) { 2929 /* 2930 * Buffer is not mapped, but the KVA was already 2931 * reserved at the time of the instantiation. Use the 2932 * allocated space. 
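 * Simply promote b_kvaalloc to b_kvabase; no new KVA reservation is
 * needed.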
2933 */ 2934 bp->b_flags &= ~B_KVAALLOC; 2935 KASSERT(bp->b_kvaalloc != 0, ("kvaalloc == 0")); 2936 bp->b_kvabase = bp->b_kvaalloc; 2937 atomic_subtract_long(&unmapped_bufspace, bp->b_kvasize); 2938 goto has_addr; 2939 } 2940 2941 /* 2942 * Calculate the amount of the address space we would reserve 2943 * if the buffer was mapped. 2944 */ 2945 bsize = vn_isdisk(bp->b_vp, NULL) ? DEV_BSIZE : bp->b_bufobj->bo_bsize; 2946 offset = blkno * bsize; 2947 maxsize = size + (offset & PAGE_MASK); 2948 maxsize = imax(maxsize, bsize); 2949 2950 mapping_loop: 2951 if (allocbufkva(bp, maxsize, gbflags)) { 2952 /* 2953 * Request defragmentation. getnewbuf() returns us the 2954 * allocated space by the scratch buffer KVA. 2955 */ 2956 scratch_bp = getnewbuf(bp->b_vp, 0, 0, size, maxsize, gbflags | 2957 (GB_UNMAPPED | GB_KVAALLOC)); 2958 if (scratch_bp == NULL) { 2959 if ((gbflags & GB_NOWAIT_BD) != 0) { 2960 /* 2961 * XXXKIB: defragmentation cannot 2962 * succeed, not sure what else to do. 2963 */ 2964 panic("GB_NOWAIT_BD and B_UNMAPPED %p", bp); 2965 } 2966 atomic_add_int(&mappingrestarts, 1); 2967 goto mapping_loop; 2968 } 2969 KASSERT((scratch_bp->b_flags & B_KVAALLOC) != 0, 2970 ("scratch bp !B_KVAALLOC %p", scratch_bp)); 2971 setbufkva(bp, (vm_offset_t)scratch_bp->b_kvaalloc, 2972 scratch_bp->b_kvasize, gbflags); 2973 2974 /* Get rid of the scratch buffer. */ 2975 scratch_bp->b_kvasize = 0; 2976 scratch_bp->b_flags |= B_INVAL; 2977 scratch_bp->b_flags &= ~(B_UNMAPPED | B_KVAALLOC); 2978 brelse(scratch_bp); 2979 } 2980 if (!need_mapping) 2981 return; 2982 2983 has_addr: 2984 bp->b_saveaddr = bp->b_kvabase; 2985 bp->b_data = bp->b_saveaddr; /* b_offset is handled by bpmap_qenter */ 2986 bp->b_flags &= ~B_UNMAPPED; 2987 BUF_CHECK_MAPPED(bp); 2988 bpmap_qenter(bp); 2989 } 2990 2991 /* 2992 * getblk: 2993 * 2994 * Get a block given a specified block and offset into a file/device. 2995 * The buffers B_DONE bit will be cleared on return, making it almost 2996 * ready for an I/O initiation. B_INVAL may or may not be set on 2997 * return. The caller should clear B_INVAL prior to initiating a 2998 * READ. 2999 * 3000 * For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for 3001 * an existing buffer. 3002 * 3003 * For a VMIO buffer, B_CACHE is modified according to the backing VM. 3004 * If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set 3005 * and then cleared based on the backing VM. If the previous buffer is 3006 * non-0-sized but invalid, B_CACHE will be cleared. 3007 * 3008 * If getblk() must create a new buffer, the new buffer is returned with 3009 * both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which 3010 * case it is returned with B_INVAL clear and B_CACHE set based on the 3011 * backing VM. 3012 * 3013 * getblk() also forces a bwrite() for any B_DELWRI buffer whos 3014 * B_CACHE bit is clear. 3015 * 3016 * What this means, basically, is that the caller should use B_CACHE to 3017 * determine whether the buffer is fully valid or not and should clear 3018 * B_INVAL prior to issuing a read. If the caller intends to validate 3019 * the buffer by loading its data area with something, the caller needs 3020 * to clear B_INVAL. If the caller does this without issuing an I/O, 3021 * the caller should set B_CACHE ( as an optimization ), else the caller 3022 * should issue the I/O and biodone() will set B_CACHE if the I/O was 3023 * a write attempt or if it was a successfull read. 
If the caller 3024 * intends to issue a READ, the caller must clear B_INVAL and BIO_ERROR 3025 * prior to issuing the READ. biodone() will *not* clear B_INVAL. 3026 */ 3027 struct buf * 3028 getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo, 3029 int flags) 3030 { 3031 struct buf *bp; 3032 struct bufobj *bo; 3033 int bsize, error, maxsize, vmio; 3034 off_t offset; 3035 3036 CTR3(KTR_BUF, "getblk(%p, %ld, %d)", vp, (long)blkno, size); 3037 KASSERT((flags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC, 3038 ("GB_KVAALLOC only makes sense with GB_UNMAPPED")); 3039 ASSERT_VOP_LOCKED(vp, "getblk"); 3040 if (size > MAXBSIZE) 3041 panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE); 3042 if (!unmapped_buf_allowed) 3043 flags &= ~(GB_UNMAPPED | GB_KVAALLOC); 3044 3045 bo = &vp->v_bufobj; 3046 loop: 3047 BO_RLOCK(bo); 3048 bp = gbincore(bo, blkno); 3049 if (bp != NULL) { 3050 int lockflags; 3051 /* 3052 * Buffer is in-core. If the buffer is not busy nor managed, 3053 * it must be on a queue. 3054 */ 3055 lockflags = LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK; 3056 3057 if (flags & GB_LOCK_NOWAIT) 3058 lockflags |= LK_NOWAIT; 3059 3060 error = BUF_TIMELOCK(bp, lockflags, 3061 BO_LOCKPTR(bo), "getblk", slpflag, slptimeo); 3062 3063 /* 3064 * If we slept and got the lock we have to restart in case 3065 * the buffer changed identities. 3066 */ 3067 if (error == ENOLCK) 3068 goto loop; 3069 /* We timed out or were interrupted. */ 3070 else if (error) 3071 return (NULL); 3072 /* If recursed, assume caller knows the rules. */ 3073 else if (BUF_LOCKRECURSED(bp)) 3074 goto end; 3075 3076 /* 3077 * The buffer is locked. B_CACHE is cleared if the buffer is 3078 * invalid. Otherwise, for a non-VMIO buffer, B_CACHE is set 3079 * and for a VMIO buffer B_CACHE is adjusted according to the 3080 * backing VM cache. 3081 */ 3082 if (bp->b_flags & B_INVAL) 3083 bp->b_flags &= ~B_CACHE; 3084 else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0) 3085 bp->b_flags |= B_CACHE; 3086 if (bp->b_flags & B_MANAGED) 3087 MPASS(bp->b_qindex == QUEUE_NONE); 3088 else 3089 bremfree(bp); 3090 3091 /* 3092 * check for size inconsistencies for non-VMIO case. 3093 */ 3094 if (bp->b_bcount != size) { 3095 if ((bp->b_flags & B_VMIO) == 0 || 3096 (size > bp->b_kvasize)) { 3097 if (bp->b_flags & B_DELWRI) { 3098 /* 3099 * If buffer is pinned and caller does 3100 * not want sleep waiting for it to be 3101 * unpinned, bail out 3102 * */ 3103 if (bp->b_pin_count > 0) { 3104 if (flags & GB_LOCK_NOWAIT) { 3105 bqrelse(bp); 3106 return (NULL); 3107 } else { 3108 bunpin_wait(bp); 3109 } 3110 } 3111 bp->b_flags |= B_NOCACHE; 3112 bwrite(bp); 3113 } else { 3114 if (LIST_EMPTY(&bp->b_dep)) { 3115 bp->b_flags |= B_RELBUF; 3116 brelse(bp); 3117 } else { 3118 bp->b_flags |= B_NOCACHE; 3119 bwrite(bp); 3120 } 3121 } 3122 goto loop; 3123 } 3124 } 3125 3126 /* 3127 * Handle the case of unmapped buffer which should 3128 * become mapped, or the buffer for which KVA 3129 * reservation is requested. 3130 */ 3131 bp_unmapped_get_kva(bp, blkno, size, flags); 3132 3133 /* 3134 * If the size is inconsistant in the VMIO case, we can resize 3135 * the buffer. This might lead to B_CACHE getting set or 3136 * cleared. If the size has not changed, B_CACHE remains 3137 * unchanged from its previous state. 
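 * allocbuf() below resizes b_bcount and the backing pages to the
 * requested size.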
3138 */ 3139 if (bp->b_bcount != size) 3140 allocbuf(bp, size); 3141 3142 KASSERT(bp->b_offset != NOOFFSET, 3143 ("getblk: no buffer offset")); 3144 3145 /* 3146 * A buffer with B_DELWRI set and B_CACHE clear must 3147 * be committed before we can return the buffer in 3148 * order to prevent the caller from issuing a read 3149 * ( due to B_CACHE not being set ) and overwriting 3150 * it. 3151 * 3152 * Most callers, including NFS and FFS, need this to 3153 * operate properly either because they assume they 3154 * can issue a read if B_CACHE is not set, or because 3155 * ( for example ) an uncached B_DELWRI might loop due 3156 * to softupdates re-dirtying the buffer. In the latter 3157 * case, B_CACHE is set after the first write completes, 3158 * preventing further loops. 3159 * NOTE! b*write() sets B_CACHE. If we cleared B_CACHE 3160 * above while extending the buffer, we cannot allow the 3161 * buffer to remain with B_CACHE set after the write 3162 * completes or it will represent a corrupt state. To 3163 * deal with this we set B_NOCACHE to scrap the buffer 3164 * after the write. 3165 * 3166 * We might be able to do something fancy, like setting 3167 * B_CACHE in bwrite() except if B_DELWRI is already set, 3168 * so the below call doesn't set B_CACHE, but that gets real 3169 * confusing. This is much easier. 3170 */ 3171 3172 if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) { 3173 bp->b_flags |= B_NOCACHE; 3174 bwrite(bp); 3175 goto loop; 3176 } 3177 bp->b_flags &= ~B_DONE; 3178 } else { 3179 /* 3180 * Buffer is not in-core, create new buffer. The buffer 3181 * returned by getnewbuf() is locked. Note that the returned 3182 * buffer is also considered valid (not marked B_INVAL). 3183 */ 3184 BO_RUNLOCK(bo); 3185 /* 3186 * If the user does not want us to create the buffer, bail out 3187 * here. 3188 */ 3189 if (flags & GB_NOCREAT) 3190 return NULL; 3191 if (numfreebuffers == 0 && TD_IS_IDLETHREAD(curthread)) 3192 return NULL; 3193 3194 bsize = vn_isdisk(vp, NULL) ? DEV_BSIZE : bo->bo_bsize; 3195 offset = blkno * bsize; 3196 vmio = vp->v_object != NULL; 3197 if (vmio) { 3198 maxsize = size + (offset & PAGE_MASK); 3199 } else { 3200 maxsize = size; 3201 /* Do not allow non-VMIO notmapped buffers. */ 3202 flags &= ~GB_UNMAPPED; 3203 } 3204 maxsize = imax(maxsize, bsize); 3205 3206 bp = getnewbuf(vp, slpflag, slptimeo, size, maxsize, flags); 3207 if (bp == NULL) { 3208 if (slpflag || slptimeo) 3209 return NULL; 3210 goto loop; 3211 } 3212 3213 /* 3214 * This code is used to make sure that a buffer is not 3215 * created while the getnewbuf routine is blocked. 3216 * This can be a problem whether the vnode is locked or not. 3217 * If the buffer is created out from under us, we have to 3218 * throw away the one we just created. 3219 * 3220 * Note: this must occur before we associate the buffer 3221 * with the vp especially considering limitations in 3222 * the splay tree implementation when dealing with duplicate 3223 * lblkno's. 3224 */ 3225 BO_LOCK(bo); 3226 if (gbincore(bo, blkno)) { 3227 BO_UNLOCK(bo); 3228 bp->b_flags |= B_INVAL; 3229 brelse(bp); 3230 goto loop; 3231 } 3232 3233 /* 3234 * Insert the buffer into the hash, so that it can 3235 * be found by incore. 3236 */ 3237 bp->b_blkno = bp->b_lblkno = blkno; 3238 bp->b_offset = offset; 3239 bgetvp(vp, bp); 3240 BO_UNLOCK(bo); 3241 3242 /* 3243 * set B_VMIO bit. allocbuf() the buffer bigger. 
Since the 3244 * buffer size starts out as 0, B_CACHE will be set by 3245 * allocbuf() for the VMIO case prior to it testing the 3246 * backing store for validity. 3247 */ 3248 3249 if (vmio) { 3250 bp->b_flags |= B_VMIO; 3251 KASSERT(vp->v_object == bp->b_bufobj->bo_object, 3252 ("ARGH! different b_bufobj->bo_object %p %p %p\n", 3253 bp, vp->v_object, bp->b_bufobj->bo_object)); 3254 } else { 3255 bp->b_flags &= ~B_VMIO; 3256 KASSERT(bp->b_bufobj->bo_object == NULL, 3257 ("ARGH! has b_bufobj->bo_object %p %p\n", 3258 bp, bp->b_bufobj->bo_object)); 3259 BUF_CHECK_MAPPED(bp); 3260 } 3261 3262 allocbuf(bp, size); 3263 bp->b_flags &= ~B_DONE; 3264 } 3265 CTR4(KTR_BUF, "getblk(%p, %ld, %d) = %p", vp, (long)blkno, size, bp); 3266 BUF_ASSERT_HELD(bp); 3267 end: 3268 KASSERT(bp->b_bufobj == bo, 3269 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 3270 return (bp); 3271 } 3272 3273 /* 3274 * Get an empty, disassociated buffer of given size. The buffer is initially 3275 * set to B_INVAL. 3276 */ 3277 struct buf * 3278 geteblk(int size, int flags) 3279 { 3280 struct buf *bp; 3281 int maxsize; 3282 3283 maxsize = (size + BKVAMASK) & ~BKVAMASK; 3284 while ((bp = getnewbuf(NULL, 0, 0, size, maxsize, flags)) == NULL) { 3285 if ((flags & GB_NOWAIT_BD) && 3286 (curthread->td_pflags & TDP_BUFNEED) != 0) 3287 return (NULL); 3288 } 3289 allocbuf(bp, size); 3290 bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */ 3291 BUF_ASSERT_HELD(bp); 3292 return (bp); 3293 } 3294 3295 3296 /* 3297 * This code constitutes the buffer memory from either anonymous system 3298 * memory (in the case of non-VMIO operations) or from an associated 3299 * VM object (in the case of VMIO operations). This code is able to 3300 * resize a buffer up or down. 3301 * 3302 * Note that this code is tricky, and has many complications to resolve 3303 * deadlock or inconsistant data situations. Tread lightly!!! 3304 * There are B_CACHE and B_DELWRI interactions that must be dealt with by 3305 * the caller. Calling this code willy nilly can result in the loss of data. 3306 * 3307 * allocbuf() only adjusts B_CACHE for VMIO buffers. getblk() deals with 3308 * B_CACHE for the non-VMIO case. 3309 */ 3310 3311 int 3312 allocbuf(struct buf *bp, int size) 3313 { 3314 int newbsize, mbsize; 3315 int i; 3316 3317 BUF_ASSERT_HELD(bp); 3318 3319 if (bp->b_kvasize < size) 3320 panic("allocbuf: buffer too small"); 3321 3322 if ((bp->b_flags & B_VMIO) == 0) { 3323 caddr_t origbuf; 3324 int origbufsize; 3325 /* 3326 * Just get anonymous memory from the kernel. Don't 3327 * mess with B_CACHE. 3328 */ 3329 mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); 3330 if (bp->b_flags & B_MALLOC) 3331 newbsize = mbsize; 3332 else 3333 newbsize = round_page(size); 3334 3335 if (newbsize < bp->b_bufsize) { 3336 /* 3337 * malloced buffers are not shrunk 3338 */ 3339 if (bp->b_flags & B_MALLOC) { 3340 if (newbsize) { 3341 bp->b_bcount = size; 3342 } else { 3343 free(bp->b_data, M_BIOBUF); 3344 if (bp->b_bufsize) { 3345 atomic_subtract_long( 3346 &bufmallocspace, 3347 bp->b_bufsize); 3348 bufspacewakeup(); 3349 bp->b_bufsize = 0; 3350 } 3351 bp->b_saveaddr = bp->b_kvabase; 3352 bp->b_data = bp->b_saveaddr; 3353 bp->b_bcount = 0; 3354 bp->b_flags &= ~B_MALLOC; 3355 } 3356 return 1; 3357 } 3358 vm_hold_free_pages(bp, newbsize); 3359 } else if (newbsize > bp->b_bufsize) { 3360 /* 3361 * We only use malloced memory on the first allocation. 3362 * and revert to page-allocated memory when the buffer 3363 * grows. 
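 * Only buffers of at most half a page are malloced, and only on their
 * first allocation, subject to the bufmallocspace limit.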
3364 */ 3365 /* 3366 * There is a potential smp race here that could lead 3367 * to bufmallocspace slightly passing the max. It 3368 * is probably extremely rare and not worth worrying 3369 * over. 3370 */ 3371 if ( (bufmallocspace < maxbufmallocspace) && 3372 (bp->b_bufsize == 0) && 3373 (mbsize <= PAGE_SIZE/2)) { 3374 3375 bp->b_data = malloc(mbsize, M_BIOBUF, M_WAITOK); 3376 bp->b_bufsize = mbsize; 3377 bp->b_bcount = size; 3378 bp->b_flags |= B_MALLOC; 3379 atomic_add_long(&bufmallocspace, mbsize); 3380 return 1; 3381 } 3382 origbuf = NULL; 3383 origbufsize = 0; 3384 /* 3385 * If the buffer is growing on its other-than-first allocation, 3386 * then we revert to the page-allocation scheme. 3387 */ 3388 if (bp->b_flags & B_MALLOC) { 3389 origbuf = bp->b_data; 3390 origbufsize = bp->b_bufsize; 3391 bp->b_data = bp->b_kvabase; 3392 if (bp->b_bufsize) { 3393 atomic_subtract_long(&bufmallocspace, 3394 bp->b_bufsize); 3395 bufspacewakeup(); 3396 bp->b_bufsize = 0; 3397 } 3398 bp->b_flags &= ~B_MALLOC; 3399 newbsize = round_page(newbsize); 3400 } 3401 vm_hold_load_pages( 3402 bp, 3403 (vm_offset_t) bp->b_data + bp->b_bufsize, 3404 (vm_offset_t) bp->b_data + newbsize); 3405 if (origbuf) { 3406 bcopy(origbuf, bp->b_data, origbufsize); 3407 free(origbuf, M_BIOBUF); 3408 } 3409 } 3410 } else { 3411 int desiredpages; 3412 3413 newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); 3414 desiredpages = (size == 0) ? 0 : 3415 num_pages((bp->b_offset & PAGE_MASK) + newbsize); 3416 3417 if (bp->b_flags & B_MALLOC) 3418 panic("allocbuf: VMIO buffer can't be malloced"); 3419 /* 3420 * Set B_CACHE initially if buffer is 0 length or will become 3421 * 0-length. 3422 */ 3423 if (size == 0 || bp->b_bufsize == 0) 3424 bp->b_flags |= B_CACHE; 3425 3426 if (newbsize < bp->b_bufsize) { 3427 /* 3428 * DEV_BSIZE aligned new buffer size is less then the 3429 * DEV_BSIZE aligned existing buffer size. Figure out 3430 * if we have to remove any pages. 3431 */ 3432 if (desiredpages < bp->b_npages) { 3433 vm_page_t m; 3434 3435 if ((bp->b_flags & B_UNMAPPED) == 0) { 3436 BUF_CHECK_MAPPED(bp); 3437 pmap_qremove((vm_offset_t)trunc_page( 3438 (vm_offset_t)bp->b_data) + 3439 (desiredpages << PAGE_SHIFT), 3440 (bp->b_npages - desiredpages)); 3441 } else 3442 BUF_CHECK_UNMAPPED(bp); 3443 VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); 3444 for (i = desiredpages; i < bp->b_npages; i++) { 3445 /* 3446 * the page is not freed here -- it 3447 * is the responsibility of 3448 * vnode_pager_setsize 3449 */ 3450 m = bp->b_pages[i]; 3451 KASSERT(m != bogus_page, 3452 ("allocbuf: bogus page found")); 3453 while (vm_page_sleep_if_busy(m, TRUE, 3454 "biodep")) 3455 continue; 3456 3457 bp->b_pages[i] = NULL; 3458 vm_page_lock(m); 3459 vm_page_unwire(m, 0); 3460 vm_page_unlock(m); 3461 } 3462 VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); 3463 bp->b_npages = desiredpages; 3464 } 3465 } else if (size > bp->b_bcount) { 3466 /* 3467 * We are growing the buffer, possibly in a 3468 * byte-granular fashion. 3469 */ 3470 vm_object_t obj; 3471 vm_offset_t toff; 3472 vm_offset_t tinc; 3473 3474 /* 3475 * Step 1, bring in the VM pages from the object, 3476 * allocating them if necessary. We must clear 3477 * B_CACHE if these pages are not valid for the 3478 * range covered by the buffer. 
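 * The pages are grabbed wired (VM_ALLOC_WIRED) so they cannot be
 * reclaimed while they back the buffer.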
3479 */ 3480 3481 obj = bp->b_bufobj->bo_object; 3482 3483 VM_OBJECT_WLOCK(obj); 3484 while (bp->b_npages < desiredpages) { 3485 vm_page_t m; 3486 3487 /* 3488 * We must allocate system pages since blocking 3489 * here could interfere with paging I/O, no 3490 * matter which process we are. 3491 * 3492 * We can only test VPO_BUSY here. Blocking on 3493 * m->busy might lead to a deadlock: 3494 * vm_fault->getpages->cluster_read->allocbuf 3495 * Thus, we specify VM_ALLOC_IGN_SBUSY. 3496 */ 3497 m = vm_page_grab(obj, OFF_TO_IDX(bp->b_offset) + 3498 bp->b_npages, VM_ALLOC_NOBUSY | 3499 VM_ALLOC_SYSTEM | VM_ALLOC_WIRED | 3500 VM_ALLOC_RETRY | VM_ALLOC_IGN_SBUSY | 3501 VM_ALLOC_COUNT(desiredpages - bp->b_npages)); 3502 if (m->valid == 0) 3503 bp->b_flags &= ~B_CACHE; 3504 bp->b_pages[bp->b_npages] = m; 3505 ++bp->b_npages; 3506 } 3507 3508 /* 3509 * Step 2. We've loaded the pages into the buffer, 3510 * we have to figure out if we can still have B_CACHE 3511 * set. Note that B_CACHE is set according to the 3512 * byte-granular range ( bcount and size ), new the 3513 * aligned range ( newbsize ). 3514 * 3515 * The VM test is against m->valid, which is DEV_BSIZE 3516 * aligned. Needless to say, the validity of the data 3517 * needs to also be DEV_BSIZE aligned. Note that this 3518 * fails with NFS if the server or some other client 3519 * extends the file's EOF. If our buffer is resized, 3520 * B_CACHE may remain set! XXX 3521 */ 3522 3523 toff = bp->b_bcount; 3524 tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK); 3525 3526 while ((bp->b_flags & B_CACHE) && toff < size) { 3527 vm_pindex_t pi; 3528 3529 if (tinc > (size - toff)) 3530 tinc = size - toff; 3531 3532 pi = ((bp->b_offset & PAGE_MASK) + toff) >> 3533 PAGE_SHIFT; 3534 3535 vfs_buf_test_cache( 3536 bp, 3537 bp->b_offset, 3538 toff, 3539 tinc, 3540 bp->b_pages[pi] 3541 ); 3542 toff += tinc; 3543 tinc = PAGE_SIZE; 3544 } 3545 VM_OBJECT_WUNLOCK(obj); 3546 3547 /* 3548 * Step 3, fixup the KVM pmap. 3549 */ 3550 if ((bp->b_flags & B_UNMAPPED) == 0) 3551 bpmap_qenter(bp); 3552 else 3553 BUF_CHECK_UNMAPPED(bp); 3554 } 3555 } 3556 if (newbsize < bp->b_bufsize) 3557 bufspacewakeup(); 3558 bp->b_bufsize = newbsize; /* actual buffer allocation */ 3559 bp->b_bcount = size; /* requested buffer size */ 3560 return 1; 3561 } 3562 3563 extern int inflight_transient_maps; 3564 3565 void 3566 biodone(struct bio *bp) 3567 { 3568 struct mtx *mtxp; 3569 void (*done)(struct bio *); 3570 vm_offset_t start, end; 3571 int transient; 3572 3573 mtxp = mtx_pool_find(mtxpool_sleep, bp); 3574 mtx_lock(mtxp); 3575 bp->bio_flags |= BIO_DONE; 3576 if ((bp->bio_flags & BIO_TRANSIENT_MAPPING) != 0) { 3577 start = trunc_page((vm_offset_t)bp->bio_data); 3578 end = round_page((vm_offset_t)bp->bio_data + bp->bio_length); 3579 transient = 1; 3580 } else { 3581 transient = 0; 3582 start = end = 0; 3583 } 3584 done = bp->bio_done; 3585 if (done == NULL) 3586 wakeup(bp); 3587 mtx_unlock(mtxp); 3588 if (done != NULL) 3589 done(bp); 3590 if (transient) { 3591 pmap_qremove(start, OFF_TO_IDX(end - start)); 3592 vmem_free(transient_arena, start, end - start); 3593 atomic_add_int(&inflight_transient_maps, -1); 3594 } 3595 } 3596 3597 /* 3598 * Wait for a BIO to finish. 3599 * 3600 * XXX: resort to a timeout for now. The optimal locking (if any) for this 3601 * case is not yet clear. 
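 * The msleep() below therefore wakes up every hz/10 ticks and re-checks
 * BIO_DONE even if no wakeup was delivered.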
3602 */ 3603 int 3604 biowait(struct bio *bp, const char *wchan) 3605 { 3606 struct mtx *mtxp; 3607 3608 mtxp = mtx_pool_find(mtxpool_sleep, bp); 3609 mtx_lock(mtxp); 3610 while ((bp->bio_flags & BIO_DONE) == 0) 3611 msleep(bp, mtxp, PRIBIO, wchan, hz / 10); 3612 mtx_unlock(mtxp); 3613 if (bp->bio_error != 0) 3614 return (bp->bio_error); 3615 if (!(bp->bio_flags & BIO_ERROR)) 3616 return (0); 3617 return (EIO); 3618 } 3619 3620 void 3621 biofinish(struct bio *bp, struct devstat *stat, int error) 3622 { 3623 3624 if (error) { 3625 bp->bio_error = error; 3626 bp->bio_flags |= BIO_ERROR; 3627 } 3628 if (stat != NULL) 3629 devstat_end_transaction_bio(stat, bp); 3630 biodone(bp); 3631 } 3632 3633 /* 3634 * bufwait: 3635 * 3636 * Wait for buffer I/O completion, returning error status. The buffer 3637 * is left locked and B_DONE on return. B_EINTR is converted into an EINTR 3638 * error and cleared. 3639 */ 3640 int 3641 bufwait(struct buf *bp) 3642 { 3643 if (bp->b_iocmd == BIO_READ) 3644 bwait(bp, PRIBIO, "biord"); 3645 else 3646 bwait(bp, PRIBIO, "biowr"); 3647 if (bp->b_flags & B_EINTR) { 3648 bp->b_flags &= ~B_EINTR; 3649 return (EINTR); 3650 } 3651 if (bp->b_ioflags & BIO_ERROR) { 3652 return (bp->b_error ? bp->b_error : EIO); 3653 } else { 3654 return (0); 3655 } 3656 } 3657 3658 /* 3659 * Call back function from struct bio back up to struct buf. 3660 */ 3661 static void 3662 bufdonebio(struct bio *bip) 3663 { 3664 struct buf *bp; 3665 3666 bp = bip->bio_caller2; 3667 bp->b_resid = bp->b_bcount - bip->bio_completed; 3668 bp->b_resid = bip->bio_resid; /* XXX: remove */ 3669 bp->b_ioflags = bip->bio_flags; 3670 bp->b_error = bip->bio_error; 3671 if (bp->b_error) 3672 bp->b_ioflags |= BIO_ERROR; 3673 bufdone(bp); 3674 g_destroy_bio(bip); 3675 } 3676 3677 void 3678 dev_strategy(struct cdev *dev, struct buf *bp) 3679 { 3680 struct cdevsw *csw; 3681 int ref; 3682 3683 KASSERT(dev->si_refcount > 0, 3684 ("dev_strategy on un-referenced struct cdev *(%s) %p", 3685 devtoname(dev), dev)); 3686 3687 csw = dev_refthread(dev, &ref); 3688 dev_strategy_csw(dev, csw, bp); 3689 dev_relthread(dev, ref); 3690 } 3691 3692 void 3693 dev_strategy_csw(struct cdev *dev, struct cdevsw *csw, struct buf *bp) 3694 { 3695 struct bio *bip; 3696 3697 KASSERT(bp->b_iocmd == BIO_READ || bp->b_iocmd == BIO_WRITE, 3698 ("b_iocmd botch")); 3699 KASSERT(((dev->si_flags & SI_ETERNAL) != 0 && csw != NULL) || 3700 dev->si_threadcount > 0, 3701 ("dev_strategy_csw threadcount cdev *(%s) %p", devtoname(dev), 3702 dev)); 3703 if (csw == NULL) { 3704 bp->b_error = ENXIO; 3705 bp->b_ioflags = BIO_ERROR; 3706 bufdone(bp); 3707 return; 3708 } 3709 for (;;) { 3710 bip = g_new_bio(); 3711 if (bip != NULL) 3712 break; 3713 /* Try again later */ 3714 tsleep(&bp, PRIBIO, "dev_strat", hz/10); 3715 } 3716 bip->bio_cmd = bp->b_iocmd; 3717 bip->bio_offset = bp->b_iooffset; 3718 bip->bio_length = bp->b_bcount; 3719 bip->bio_bcount = bp->b_bcount; /* XXX: remove */ 3720 bdata2bio(bp, bip); 3721 bip->bio_done = bufdonebio; 3722 bip->bio_caller2 = bp; 3723 bip->bio_dev = dev; 3724 (*csw->d_strategy)(bip); 3725 } 3726 3727 /* 3728 * bufdone: 3729 * 3730 * Finish I/O on a buffer, optionally calling a completion function. 3731 * This is usually called from an interrupt so process blocking is 3732 * not allowed. 3733 * 3734 * biodone is also responsible for setting B_CACHE in a B_VMIO bp. 3735 * In a non-VMIO bp, B_CACHE will be set on the next getblk() 3736 * assuming B_INVAL is clear. 
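 * (For the non-VMIO case this happens in getblk(), which sets B_CACHE
 * on any in-core buffer that is neither B_INVAL nor B_VMIO.)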
3737 * 3738 * For the VMIO case, we set B_CACHE if the op was a read and no 3739 * read error occured, or if the op was a write. B_CACHE is never 3740 * set if the buffer is invalid or otherwise uncacheable. 3741 * 3742 * biodone does not mess with B_INVAL, allowing the I/O routine or the 3743 * initiator to leave B_INVAL set to brelse the buffer out of existance 3744 * in the biodone routine. 3745 */ 3746 void 3747 bufdone(struct buf *bp) 3748 { 3749 struct bufobj *dropobj; 3750 void (*biodone)(struct buf *); 3751 3752 CTR3(KTR_BUF, "bufdone(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); 3753 dropobj = NULL; 3754 3755 KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp)); 3756 BUF_ASSERT_HELD(bp); 3757 3758 runningbufwakeup(bp); 3759 if (bp->b_iocmd == BIO_WRITE) 3760 dropobj = bp->b_bufobj; 3761 /* call optional completion function if requested */ 3762 if (bp->b_iodone != NULL) { 3763 biodone = bp->b_iodone; 3764 bp->b_iodone = NULL; 3765 (*biodone) (bp); 3766 if (dropobj) 3767 bufobj_wdrop(dropobj); 3768 return; 3769 } 3770 3771 bufdone_finish(bp); 3772 3773 if (dropobj) 3774 bufobj_wdrop(dropobj); 3775 } 3776 3777 void 3778 bufdone_finish(struct buf *bp) 3779 { 3780 BUF_ASSERT_HELD(bp); 3781 3782 if (!LIST_EMPTY(&bp->b_dep)) 3783 buf_complete(bp); 3784 3785 if (bp->b_flags & B_VMIO) { 3786 vm_ooffset_t foff; 3787 vm_page_t m; 3788 vm_object_t obj; 3789 struct vnode *vp; 3790 int bogus, i, iosize; 3791 3792 obj = bp->b_bufobj->bo_object; 3793 KASSERT(obj->paging_in_progress >= bp->b_npages, 3794 ("biodone_finish: paging in progress(%d) < b_npages(%d)", 3795 obj->paging_in_progress, bp->b_npages)); 3796 3797 vp = bp->b_vp; 3798 KASSERT(vp->v_holdcnt > 0, 3799 ("biodone_finish: vnode %p has zero hold count", vp)); 3800 KASSERT(vp->v_object != NULL, 3801 ("biodone_finish: vnode %p has no vm_object", vp)); 3802 3803 foff = bp->b_offset; 3804 KASSERT(bp->b_offset != NOOFFSET, 3805 ("biodone_finish: bp %p has no buffer offset", bp)); 3806 3807 /* 3808 * Set B_CACHE if the op was a normal read and no error 3809 * occured. B_CACHE is set for writes in the b*write() 3810 * routines. 3811 */ 3812 iosize = bp->b_bcount - bp->b_resid; 3813 if (bp->b_iocmd == BIO_READ && 3814 !(bp->b_flags & (B_INVAL|B_NOCACHE)) && 3815 !(bp->b_ioflags & BIO_ERROR)) { 3816 bp->b_flags |= B_CACHE; 3817 } 3818 bogus = 0; 3819 VM_OBJECT_WLOCK(obj); 3820 for (i = 0; i < bp->b_npages; i++) { 3821 int bogusflag = 0; 3822 int resid; 3823 3824 resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff; 3825 if (resid > iosize) 3826 resid = iosize; 3827 3828 /* 3829 * cleanup bogus pages, restoring the originals 3830 */ 3831 m = bp->b_pages[i]; 3832 if (m == bogus_page) { 3833 bogus = bogusflag = 1; 3834 m = vm_page_lookup(obj, OFF_TO_IDX(foff)); 3835 if (m == NULL) 3836 panic("biodone: page disappeared!"); 3837 bp->b_pages[i] = m; 3838 } 3839 KASSERT(OFF_TO_IDX(foff) == m->pindex, 3840 ("biodone_finish: foff(%jd)/pindex(%ju) mismatch", 3841 (intmax_t)foff, (uintmax_t)m->pindex)); 3842 3843 /* 3844 * In the write case, the valid and clean bits are 3845 * already changed correctly ( see bdwrite() ), so we 3846 * only need to do this here in the read case. 
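 * vfs_page_set_valid() marks the freshly read range valid without
 * touching the page's dirty bits.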
3847 */ 3848 if ((bp->b_iocmd == BIO_READ) && !bogusflag && resid > 0) { 3849 KASSERT((m->dirty & vm_page_bits(foff & 3850 PAGE_MASK, resid)) == 0, ("bufdone_finish:" 3851 " page %p has unexpected dirty bits", m)); 3852 vfs_page_set_valid(bp, foff, m); 3853 } 3854 3855 vm_page_io_finish(m); 3856 vm_object_pip_subtract(obj, 1); 3857 foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; 3858 iosize -= resid; 3859 } 3860 vm_object_pip_wakeupn(obj, 0); 3861 VM_OBJECT_WUNLOCK(obj); 3862 if (bogus && (bp->b_flags & B_UNMAPPED) == 0) { 3863 BUF_CHECK_MAPPED(bp); 3864 pmap_qenter(trunc_page((vm_offset_t)bp->b_data), 3865 bp->b_pages, bp->b_npages); 3866 } 3867 } 3868 3869 /* 3870 * For asynchronous completions, release the buffer now. The brelse 3871 * will do a wakeup there if necessary - so no need to do a wakeup 3872 * here in the async case. The sync case always needs to do a wakeup. 3873 */ 3874 3875 if (bp->b_flags & B_ASYNC) { 3876 if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_RELBUF)) || (bp->b_ioflags & BIO_ERROR)) 3877 brelse(bp); 3878 else 3879 bqrelse(bp); 3880 } else 3881 bdone(bp); 3882 } 3883 3884 /* 3885 * This routine is called in lieu of iodone in the case of 3886 * incomplete I/O. This keeps the busy status for pages 3887 * consistent. 3888 */ 3889 void 3890 vfs_unbusy_pages(struct buf *bp) 3891 { 3892 int i; 3893 vm_object_t obj; 3894 vm_page_t m; 3895 3896 runningbufwakeup(bp); 3897 if (!(bp->b_flags & B_VMIO)) 3898 return; 3899 3900 obj = bp->b_bufobj->bo_object; 3901 VM_OBJECT_WLOCK(obj); 3902 for (i = 0; i < bp->b_npages; i++) { 3903 m = bp->b_pages[i]; 3904 if (m == bogus_page) { 3905 m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i); 3906 if (!m) 3907 panic("vfs_unbusy_pages: page missing\n"); 3908 bp->b_pages[i] = m; 3909 if ((bp->b_flags & B_UNMAPPED) == 0) { 3910 BUF_CHECK_MAPPED(bp); 3911 pmap_qenter(trunc_page((vm_offset_t)bp->b_data), 3912 bp->b_pages, bp->b_npages); 3913 } else 3914 BUF_CHECK_UNMAPPED(bp); 3915 } 3916 vm_object_pip_subtract(obj, 1); 3917 vm_page_io_finish(m); 3918 } 3919 vm_object_pip_wakeupn(obj, 0); 3920 VM_OBJECT_WUNLOCK(obj); 3921 } 3922 3923 /* 3924 * vfs_page_set_valid: 3925 * 3926 * Set the valid bits in a page based on the supplied offset. The 3927 * range is restricted to the buffer's size. 3928 * 3929 * This routine is typically called after a read completes. 3930 */ 3931 static void 3932 vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m) 3933 { 3934 vm_ooffset_t eoff; 3935 3936 /* 3937 * Compute the end offset, eoff, such that [off, eoff) does not span a 3938 * page boundary and eoff is not greater than the end of the buffer. 3939 * The end of the buffer, in this case, is our file EOF, not the 3940 * allocation size of the buffer. 3941 */ 3942 eoff = (off + PAGE_SIZE) & ~(vm_ooffset_t)PAGE_MASK; 3943 if (eoff > bp->b_offset + bp->b_bcount) 3944 eoff = bp->b_offset + bp->b_bcount; 3945 3946 /* 3947 * Set valid range. This is typically the entire buffer and thus the 3948 * entire page. 3949 */ 3950 if (eoff > off) 3951 vm_page_set_valid_range(m, off & PAGE_MASK, eoff - off); 3952 } 3953 3954 /* 3955 * vfs_page_set_validclean: 3956 * 3957 * Set the valid bits and clear the dirty bits in a page based on the 3958 * supplied offset. The range is restricted to the buffer's size. 3959 */ 3960 static void 3961 vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off, vm_page_t m) 3962 { 3963 vm_ooffset_t soff, eoff; 3964 3965 /* 3966 * Start and end offsets in buffer.
eoff - soff may not cross a 3967 * page boundary or cross the end of the buffer. The end of the 3968 * buffer, in this case, is our file EOF, not the allocation size 3969 * of the buffer. 3970 */ 3971 soff = off; 3972 eoff = (off + PAGE_SIZE) & ~(off_t)PAGE_MASK; 3973 if (eoff > bp->b_offset + bp->b_bcount) 3974 eoff = bp->b_offset + bp->b_bcount; 3975 3976 /* 3977 * Set valid range. This is typically the entire buffer and thus the 3978 * entire page. 3979 */ 3980 if (eoff > soff) { 3981 vm_page_set_validclean( 3982 m, 3983 (vm_offset_t) (soff & PAGE_MASK), 3984 (vm_offset_t) (eoff - soff) 3985 ); 3986 } 3987 } 3988 3989 /* 3990 * Ensure that all buffer pages are not busied by the VPO_BUSY flag. If 3991 * any page is busy, drain the flag. 3992 */ 3993 static void 3994 vfs_drain_busy_pages(struct buf *bp) 3995 { 3996 vm_page_t m; 3997 int i, last_busied; 3998 3999 VM_OBJECT_ASSERT_WLOCKED(bp->b_bufobj->bo_object); 4000 last_busied = 0; 4001 for (i = 0; i < bp->b_npages; i++) { 4002 m = bp->b_pages[i]; 4003 if ((m->oflags & VPO_BUSY) != 0) { 4004 for (; last_busied < i; last_busied++) 4005 vm_page_busy(bp->b_pages[last_busied]); 4006 while ((m->oflags & VPO_BUSY) != 0) 4007 vm_page_sleep(m, "vbpage"); 4008 } 4009 } 4010 for (i = 0; i < last_busied; i++) 4011 vm_page_wakeup(bp->b_pages[i]); 4012 } 4013 4014 /* 4015 * This routine is called before a device strategy routine. 4016 * It is used to tell the VM system that paging I/O is in 4017 * progress, and treat the pages associated with the buffer 4018 * almost as being VPO_BUSY. Also the object paging_in_progress 4019 * flag is handled to make sure that the object doesn't become 4020 * inconsistent. 4021 * 4022 * Since I/O has not been initiated yet, certain buffer flags 4023 * such as BIO_ERROR or B_INVAL may be in an inconsistent state 4024 * and should be ignored. 4025 */ 4026 void 4027 vfs_busy_pages(struct buf *bp, int clear_modify) 4028 { 4029 int i, bogus; 4030 vm_object_t obj; 4031 vm_ooffset_t foff; 4032 vm_page_t m; 4033 4034 if (!(bp->b_flags & B_VMIO)) 4035 return; 4036 4037 obj = bp->b_bufobj->bo_object; 4038 foff = bp->b_offset; 4039 KASSERT(bp->b_offset != NOOFFSET, 4040 ("vfs_busy_pages: no buffer offset")); 4041 VM_OBJECT_WLOCK(obj); 4042 vfs_drain_busy_pages(bp); 4043 if (bp->b_bufsize != 0) 4044 vfs_setdirty_locked_object(bp); 4045 bogus = 0; 4046 for (i = 0; i < bp->b_npages; i++) { 4047 m = bp->b_pages[i]; 4048 4049 if ((bp->b_flags & B_CLUSTER) == 0) { 4050 vm_object_pip_add(obj, 1); 4051 vm_page_io_start(m); 4052 } 4053 /* 4054 * When readying a buffer for a read ( i.e., 4055 * clear_modify == 0 ), it is important to do 4056 * bogus_page replacement for valid pages in 4057 * partially instantiated buffers. Partially 4058 * instantiated buffers can, in turn, occur when 4059 * reconstituting a buffer from its VM backing store 4060 * base. We only have to do this if B_CACHE is 4061 * clear ( which causes the I/O to occur in the 4062 * first place ). The replacement prevents the read 4063 * I/O from overwriting potentially dirty VM-backed 4064 * pages. XXX bogus page replacement is, uh, bogus. 4065 * It may not work properly with small-block devices. 4066 * We need to find a better way.
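 *
 * (Illustrative restatement of the choice made below, not new code:
 * a write write-protects each page and marks it valid and clean,
 *
 *	pmap_remove_write(m);
 *	vfs_page_set_validclean(bp, foff, m);
 *
 * while a read into a partially instantiated buffer swaps fully
 * valid pages for bogus_page,
 *
 *	bp->b_pages[i] = bogus_page;
 *
 * and bufdone_finish() restores the originals once that read
 * completes.)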
4067 */ 4068 if (clear_modify) { 4069 pmap_remove_write(m); 4070 vfs_page_set_validclean(bp, foff, m); 4071 } else if (m->valid == VM_PAGE_BITS_ALL && 4072 (bp->b_flags & B_CACHE) == 0) { 4073 bp->b_pages[i] = bogus_page; 4074 bogus++; 4075 } 4076 foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; 4077 } 4078 VM_OBJECT_WUNLOCK(obj); 4079 if (bogus && (bp->b_flags & B_UNMAPPED) == 0) { 4080 BUF_CHECK_MAPPED(bp); 4081 pmap_qenter(trunc_page((vm_offset_t)bp->b_data), 4082 bp->b_pages, bp->b_npages); 4083 } 4084 } 4085 4086 /* 4087 * vfs_bio_set_valid: 4088 * 4089 * Set the range within the buffer to valid. The range is 4090 * relative to the beginning of the buffer, b_offset. Note that 4091 * b_offset itself may be offset from the beginning of the first 4092 * page. 4093 */ 4094 void 4095 vfs_bio_set_valid(struct buf *bp, int base, int size) 4096 { 4097 int i, n; 4098 vm_page_t m; 4099 4100 if (!(bp->b_flags & B_VMIO)) 4101 return; 4102 4103 /* 4104 * Fixup base to be relative to beginning of first page. 4105 * Set initial n to be the maximum number of bytes in the 4106 * first page that can be validated. 4107 */ 4108 base += (bp->b_offset & PAGE_MASK); 4109 n = PAGE_SIZE - (base & PAGE_MASK); 4110 4111 VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); 4112 for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) { 4113 m = bp->b_pages[i]; 4114 if (n > size) 4115 n = size; 4116 vm_page_set_valid_range(m, base & PAGE_MASK, n); 4117 base += n; 4118 size -= n; 4119 n = PAGE_SIZE; 4120 } 4121 VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); 4122 } 4123 4124 /* 4125 * vfs_bio_clrbuf: 4126 * 4127 * If the specified buffer is a non-VMIO buffer, clear the entire 4128 * buffer. If the specified buffer is a VMIO buffer, clear and 4129 * validate only the previously invalid portions of the buffer. 4130 * This routine essentially fakes an I/O, so we need to clear 4131 * BIO_ERROR and B_INVAL. 4132 * 4133 * Note that while we only theoretically need to clear through b_bcount, 4134 * we go ahead and clear through b_bufsize. 
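 *
 * Illustrative usage sketch (an assumption about callers, not code
 * from this file): a filesystem that has just allocated a fresh
 * block typically does
 *
 *	bp = getblk(vp, lblkno, size, 0, 0, 0);
 *	vfs_bio_clrbuf(bp);		zero only the still-invalid portions
 *	bdwrite(bp);
 *
 * so that data already valid in the cache is not needlessly cleared.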
4135 */ 4136 void 4137 vfs_bio_clrbuf(struct buf *bp) 4138 { 4139 int i, j, mask, sa, ea, slide; 4140 4141 if ((bp->b_flags & (B_VMIO | B_MALLOC)) != B_VMIO) { 4142 clrbuf(bp); 4143 return; 4144 } 4145 bp->b_flags &= ~B_INVAL; 4146 bp->b_ioflags &= ~BIO_ERROR; 4147 VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); 4148 if ((bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) && 4149 (bp->b_offset & PAGE_MASK) == 0) { 4150 if (bp->b_pages[0] == bogus_page) 4151 goto unlock; 4152 mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1; 4153 VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[0]->object); 4154 if ((bp->b_pages[0]->valid & mask) == mask) 4155 goto unlock; 4156 if ((bp->b_pages[0]->valid & mask) == 0) { 4157 pmap_zero_page_area(bp->b_pages[0], 0, bp->b_bufsize); 4158 bp->b_pages[0]->valid |= mask; 4159 goto unlock; 4160 } 4161 } 4162 sa = bp->b_offset & PAGE_MASK; 4163 slide = 0; 4164 for (i = 0; i < bp->b_npages; i++, sa = 0) { 4165 slide = imin(slide + PAGE_SIZE, bp->b_offset + bp->b_bufsize); 4166 ea = slide & PAGE_MASK; 4167 if (ea == 0) 4168 ea = PAGE_SIZE; 4169 if (bp->b_pages[i] == bogus_page) 4170 continue; 4171 j = sa / DEV_BSIZE; 4172 mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j; 4173 VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[i]->object); 4174 if ((bp->b_pages[i]->valid & mask) == mask) 4175 continue; 4176 if ((bp->b_pages[i]->valid & mask) == 0) 4177 pmap_zero_page_area(bp->b_pages[i], sa, ea - sa); 4178 else { 4179 for (; sa < ea; sa += DEV_BSIZE, j++) { 4180 if ((bp->b_pages[i]->valid & (1 << j)) == 0) { 4181 pmap_zero_page_area(bp->b_pages[i], 4182 sa, DEV_BSIZE); 4183 } 4184 } 4185 } 4186 bp->b_pages[i]->valid |= mask; 4187 } 4188 unlock: 4189 VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); 4190 bp->b_resid = 0; 4191 } 4192 4193 void 4194 vfs_bio_bzero_buf(struct buf *bp, int base, int size) 4195 { 4196 vm_page_t m; 4197 int i, n; 4198 4199 if ((bp->b_flags & B_UNMAPPED) == 0) { 4200 BUF_CHECK_MAPPED(bp); 4201 bzero(bp->b_data + base, size); 4202 } else { 4203 BUF_CHECK_UNMAPPED(bp); 4204 n = PAGE_SIZE - (base & PAGE_MASK); 4205 for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) { 4206 m = bp->b_pages[i]; 4207 if (n > size) 4208 n = size; 4209 pmap_zero_page_area(m, base & PAGE_MASK, n); 4210 base += n; 4211 size -= n; 4212 n = PAGE_SIZE; 4213 } 4214 } 4215 } 4216 4217 /* 4218 * vm_hold_load_pages and vm_hold_free_pages get pages into 4219 * a buffer's address space. The pages are anonymous and are 4220 * not associated with a file object. 4221 */ 4222 static void 4223 vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to) 4224 { 4225 vm_offset_t pg; 4226 vm_page_t p; 4227 int index; 4228 4229 BUF_CHECK_MAPPED(bp); 4230 4231 to = round_page(to); 4232 from = round_page(from); 4233 index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; 4234 4235 for (pg = from; pg < to; pg += PAGE_SIZE, index++) { 4236 tryagain: 4237 /* 4238 * note: must allocate system pages since blocking here 4239 * could interfere with paging I/O, no matter which 4240 * process we are.
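 *
 * (Illustrative context, an assumption rather than code from this
 * function: these helpers back the anonymous, non-VMIO case of
 * allocbuf(), which grows and shrinks such a buffer roughly as
 *
 *	vm_hold_load_pages(bp, (vm_offset_t)bp->b_data + bp->b_bufsize,
 *	    (vm_offset_t)bp->b_data + newbsize);
 *	...
 *	vm_hold_free_pages(bp, newbsize);
 *
 * with each page wired and entered into the buffer's KVA below.)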
4241 */ 4242 p = vm_page_alloc(NULL, 0, VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ | 4243 VM_ALLOC_WIRED | VM_ALLOC_COUNT((to - pg) >> PAGE_SHIFT)); 4244 if (p == NULL) { 4245 VM_WAIT; 4246 goto tryagain; 4247 } 4248 pmap_qenter(pg, &p, 1); 4249 bp->b_pages[index] = p; 4250 } 4251 bp->b_npages = index; 4252 } 4253 4254 /* Return pages associated with this buf to the vm system */ 4255 static void 4256 vm_hold_free_pages(struct buf *bp, int newbsize) 4257 { 4258 vm_offset_t from; 4259 vm_page_t p; 4260 int index, newnpages; 4261 4262 BUF_CHECK_MAPPED(bp); 4263 4264 from = round_page((vm_offset_t)bp->b_data + newbsize); 4265 newnpages = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; 4266 if (bp->b_npages > newnpages) 4267 pmap_qremove(from, bp->b_npages - newnpages); 4268 for (index = newnpages; index < bp->b_npages; index++) { 4269 p = bp->b_pages[index]; 4270 bp->b_pages[index] = NULL; 4271 if (p->busy != 0) 4272 printf("vm_hold_free_pages: blkno: %jd, lblkno: %jd\n", 4273 (intmax_t)bp->b_blkno, (intmax_t)bp->b_lblkno); 4274 p->wire_count--; 4275 vm_page_free(p); 4276 atomic_subtract_int(&cnt.v_wire_count, 1); 4277 } 4278 bp->b_npages = newnpages; 4279 } 4280 4281 /* 4282 * Map an IO request into kernel virtual address space. 4283 * 4284 * All requests are (re)mapped into kernel VA space. 4285 * Notice that we use b_bufsize for the size of the buffer 4286 * to be mapped. b_bcount might be modified by the driver. 4287 * 4288 * Note that even if the caller determines that the address space should 4289 * be valid, a race or a smaller-file mapped into a larger space may 4290 * actually cause vmapbuf() to fail, so all callers of vmapbuf() MUST 4291 * check the return value. 4292 */ 4293 int 4294 vmapbuf(struct buf *bp, int mapbuf) 4295 { 4296 caddr_t kva; 4297 vm_prot_t prot; 4298 int pidx; 4299 4300 if (bp->b_bufsize < 0) 4301 return (-1); 4302 prot = VM_PROT_READ; 4303 if (bp->b_iocmd == BIO_READ) 4304 prot |= VM_PROT_WRITE; /* Less backwards than it looks */ 4305 if ((pidx = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map, 4306 (vm_offset_t)bp->b_data, bp->b_bufsize, prot, bp->b_pages, 4307 btoc(MAXPHYS))) < 0) 4308 return (-1); 4309 bp->b_npages = pidx; 4310 if (mapbuf || !unmapped_buf_allowed) { 4311 pmap_qenter((vm_offset_t)bp->b_saveaddr, bp->b_pages, pidx); 4312 kva = bp->b_saveaddr; 4313 bp->b_saveaddr = bp->b_data; 4314 bp->b_data = kva + (((vm_offset_t)bp->b_data) & PAGE_MASK); 4315 bp->b_flags &= ~B_UNMAPPED; 4316 } else { 4317 bp->b_flags |= B_UNMAPPED; 4318 bp->b_offset = ((vm_offset_t)bp->b_data) & PAGE_MASK; 4319 bp->b_saveaddr = bp->b_data; 4320 bp->b_data = unmapped_buf; 4321 } 4322 return(0); 4323 } 4324 4325 /* 4326 * Free the io map PTEs associated with this IO operation. 4327 * We also invalidate the TLB entries and restore the original b_addr. 
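 *
 * Illustrative pairing (a rough sketch of a physio()-style caller,
 * not the actual implementation; "physwt" is a made-up wchan and
 * mapped selects whether the held pages are also mapped into KVA):
 *
 *	if (vmapbuf(bp, mapped) < 0)
 *		return (EFAULT);
 *	bp->b_iodone = bdone;
 *	dev_strategy(dev, bp);
 *	bwait(bp, PRIBIO, "physwt");
 *	vunmapbuf(bp);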
4328 */ 4329 void 4330 vunmapbuf(struct buf *bp) 4331 { 4332 int npages; 4333 4334 npages = bp->b_npages; 4335 if (bp->b_flags & B_UNMAPPED) 4336 bp->b_flags &= ~B_UNMAPPED; 4337 else 4338 pmap_qremove(trunc_page((vm_offset_t)bp->b_data), npages); 4339 vm_page_unhold_pages(bp->b_pages, npages); 4340 4341 bp->b_data = bp->b_saveaddr; 4342 } 4343 4344 void 4345 bdone(struct buf *bp) 4346 { 4347 struct mtx *mtxp; 4348 4349 mtxp = mtx_pool_find(mtxpool_sleep, bp); 4350 mtx_lock(mtxp); 4351 bp->b_flags |= B_DONE; 4352 wakeup(bp); 4353 mtx_unlock(mtxp); 4354 } 4355 4356 void 4357 bwait(struct buf *bp, u_char pri, const char *wchan) 4358 { 4359 struct mtx *mtxp; 4360 4361 mtxp = mtx_pool_find(mtxpool_sleep, bp); 4362 mtx_lock(mtxp); 4363 while ((bp->b_flags & B_DONE) == 0) 4364 msleep(bp, mtxp, pri, wchan, 0); 4365 mtx_unlock(mtxp); 4366 } 4367 4368 int 4369 bufsync(struct bufobj *bo, int waitfor) 4370 { 4371 4372 return (VOP_FSYNC(bo->__bo_vnode, waitfor, curthread)); 4373 } 4374 4375 void 4376 bufstrategy(struct bufobj *bo, struct buf *bp) 4377 { 4378 int i = 0; 4379 struct vnode *vp; 4380 4381 vp = bp->b_vp; 4382 KASSERT(vp == bo->bo_private, ("Inconsistent vnode bufstrategy")); 4383 KASSERT(vp->v_type != VCHR && vp->v_type != VBLK, 4384 ("Wrong vnode in bufstrategy(bp=%p, vp=%p)", bp, vp)); 4385 i = VOP_STRATEGY(vp, bp); 4386 KASSERT(i == 0, ("VOP_STRATEGY failed bp=%p vp=%p", bp, bp->b_vp)); 4387 } 4388 4389 void 4390 bufobj_wrefl(struct bufobj *bo) 4391 { 4392 4393 KASSERT(bo != NULL, ("NULL bo in bufobj_wref")); 4394 ASSERT_BO_WLOCKED(bo); 4395 bo->bo_numoutput++; 4396 } 4397 4398 void 4399 bufobj_wref(struct bufobj *bo) 4400 { 4401 4402 KASSERT(bo != NULL, ("NULL bo in bufobj_wref")); 4403 BO_LOCK(bo); 4404 bo->bo_numoutput++; 4405 BO_UNLOCK(bo); 4406 } 4407 4408 void 4409 bufobj_wdrop(struct bufobj *bo) 4410 { 4411 4412 KASSERT(bo != NULL, ("NULL bo in bufobj_wdrop")); 4413 BO_LOCK(bo); 4414 KASSERT(bo->bo_numoutput > 0, ("bufobj_wdrop non-positive count")); 4415 if ((--bo->bo_numoutput == 0) && (bo->bo_flag & BO_WWAIT)) { 4416 bo->bo_flag &= ~BO_WWAIT; 4417 wakeup(&bo->bo_numoutput); 4418 } 4419 BO_UNLOCK(bo); 4420 } 4421 4422 int 4423 bufobj_wwait(struct bufobj *bo, int slpflag, int timeo) 4424 { 4425 int error; 4426 4427 KASSERT(bo != NULL, ("NULL bo in bufobj_wwait")); 4428 ASSERT_BO_WLOCKED(bo); 4429 error = 0; 4430 while (bo->bo_numoutput) { 4431 bo->bo_flag |= BO_WWAIT; 4432 error = msleep(&bo->bo_numoutput, BO_LOCKPTR(bo), 4433 slpflag | (PRIBIO + 1), "bo_wwait", timeo); 4434 if (error) 4435 break; 4436 } 4437 return (error); 4438 } 4439 4440 void 4441 bpin(struct buf *bp) 4442 { 4443 struct mtx *mtxp; 4444 4445 mtxp = mtx_pool_find(mtxpool_sleep, bp); 4446 mtx_lock(mtxp); 4447 bp->b_pin_count++; 4448 mtx_unlock(mtxp); 4449 } 4450 4451 void 4452 bunpin(struct buf *bp) 4453 { 4454 struct mtx *mtxp; 4455 4456 mtxp = mtx_pool_find(mtxpool_sleep, bp); 4457 mtx_lock(mtxp); 4458 if (--bp->b_pin_count == 0) 4459 wakeup(bp); 4460 mtx_unlock(mtxp); 4461 } 4462 4463 void 4464 bunpin_wait(struct buf *bp) 4465 { 4466 struct mtx *mtxp; 4467 4468 mtxp = mtx_pool_find(mtxpool_sleep, bp); 4469 mtx_lock(mtxp); 4470 while (bp->b_pin_count > 0) 4471 msleep(bp, mtxp, PRIBIO, "bwunpin", 0); 4472 mtx_unlock(mtxp); 4473 } 4474 4475 /* 4476 * Set bio_data or bio_ma for struct bio from the struct buf. 
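 *
 * (Illustrative note, not part of the original comment: a consumer
 * can distinguish the two results as
 *
 *	if ((bip->bio_flags & BIO_UNMAPPED) != 0)
 *		use bio_ma[], bio_ma_offset and bio_ma_n
 *	else
 *		use bio_data as a normal kernel pointer
 *
 * since in the unmapped case bio_data only points at the global
 * unmapped_buf placeholder.)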
4477 */ 4478 void 4479 bdata2bio(struct buf *bp, struct bio *bip) 4480 { 4481 4482 if ((bp->b_flags & B_UNMAPPED) != 0) { 4483 KASSERT(unmapped_buf_allowed, ("unmapped")); 4484 bip->bio_ma = bp->b_pages; 4485 bip->bio_ma_n = bp->b_npages; 4486 bip->bio_data = unmapped_buf; 4487 bip->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK; 4488 bip->bio_flags |= BIO_UNMAPPED; 4489 KASSERT(round_page(bip->bio_ma_offset + bip->bio_length) / 4490 PAGE_SIZE == bp->b_npages, 4491 ("Buffer %p too short: %d %lld %d", bp, bip->bio_ma_offset, 4492 (long long)bip->bio_length, bip->bio_ma_n)); 4493 } else { 4494 bip->bio_data = bp->b_data; 4495 bip->bio_ma = NULL; 4496 } 4497 } 4498 4499 #include "opt_ddb.h" 4500 #ifdef DDB 4501 #include <ddb/ddb.h> 4502 4503 /* DDB command to show buffer data */ 4504 DB_SHOW_COMMAND(buffer, db_show_buffer) 4505 { 4506 /* get args */ 4507 struct buf *bp = (struct buf *)addr; 4508 4509 if (!have_addr) { 4510 db_printf("usage: show buffer <addr>\n"); 4511 return; 4512 } 4513 4514 db_printf("buf at %p\n", bp); 4515 db_printf("b_flags = 0x%b, b_xflags=0x%b, b_vflags=0x%b\n", 4516 (u_int)bp->b_flags, PRINT_BUF_FLAGS, (u_int)bp->b_xflags, 4517 PRINT_BUF_XFLAGS, (u_int)bp->b_vflags, PRINT_BUF_VFLAGS); 4518 db_printf( 4519 "b_error = %d, b_bufsize = %ld, b_bcount = %ld, b_resid = %ld\n" 4520 "b_bufobj = (%p), b_data = %p, b_blkno = %jd, b_lblkno = %jd, " 4521 "b_dep = %p\n", 4522 bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid, 4523 bp->b_bufobj, bp->b_data, (intmax_t)bp->b_blkno, 4524 (intmax_t)bp->b_lblkno, bp->b_dep.lh_first); 4525 if (bp->b_npages) { 4526 int i; 4527 db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages); 4528 for (i = 0; i < bp->b_npages; i++) { 4529 vm_page_t m; 4530 m = bp->b_pages[i]; 4531 db_printf("(%p, 0x%lx, 0x%lx)", (void *)m->object, 4532 (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m)); 4533 if ((i + 1) < bp->b_npages) 4534 db_printf(","); 4535 } 4536 db_printf("\n"); 4537 } 4538 db_printf(" "); 4539 BUF_LOCKPRINTINFO(bp); 4540 } 4541 4542 DB_SHOW_COMMAND(lockedbufs, lockedbufs) 4543 { 4544 struct buf *bp; 4545 int i; 4546 4547 for (i = 0; i < nbuf; i++) { 4548 bp = &buf[i]; 4549 if (BUF_ISLOCKED(bp)) { 4550 db_show_buffer((uintptr_t)bp, 1, 0, NULL); 4551 db_printf("\n"); 4552 } 4553 } 4554 } 4555 4556 DB_SHOW_COMMAND(vnodebufs, db_show_vnodebufs) 4557 { 4558 struct vnode *vp; 4559 struct buf *bp; 4560 4561 if (!have_addr) { 4562 db_printf("usage: show vnodebufs <addr>\n"); 4563 return; 4564 } 4565 vp = (struct vnode *)addr; 4566 db_printf("Clean buffers:\n"); 4567 TAILQ_FOREACH(bp, &vp->v_bufobj.bo_clean.bv_hd, b_bobufs) { 4568 db_show_buffer((uintptr_t)bp, 1, 0, NULL); 4569 db_printf("\n"); 4570 } 4571 db_printf("Dirty buffers:\n"); 4572 TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs) { 4573 db_show_buffer((uintptr_t)bp, 1, 0, NULL); 4574 db_printf("\n"); 4575 } 4576 } 4577 4578 DB_COMMAND(countfreebufs, db_coundfreebufs) 4579 { 4580 struct buf *bp; 4581 int i, used = 0, nfree = 0; 4582 4583 if (have_addr) { 4584 db_printf("usage: countfreebufs\n"); 4585 return; 4586 } 4587 4588 for (i = 0; i < nbuf; i++) { 4589 bp = &buf[i]; 4590 if ((bp->b_flags & B_INFREECNT) != 0) 4591 nfree++; 4592 else 4593 used++; 4594 } 4595 4596 db_printf("Counted %d free, %d used (%d tot)\n", nfree, used, 4597 nfree + used); 4598 db_printf("numfreebuffers is %d\n", numfreebuffers); 4599 } 4600 #endif /* DDB */ 4601
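
/*
 * Example ddb(4) session for the commands defined above (an
 * illustrative transcript only; the addresses are made up):
 *
 *	db> show buffer 0xfffff80012345678
 *	db> show lockedbufs
 *	db> show vnodebufs 0xfffff80023456789
 *	db> countfreebufs
 */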