1 /*- 2 * Copyright (c) 2004 Poul-Henning Kamp 3 * Copyright (c) 1994,1997 John S. Dyson 4 * Copyright (c) 2013 The FreeBSD Foundation 5 * All rights reserved. 6 * 7 * Portions of this software were developed by Konstantin Belousov 8 * under sponsorship from the FreeBSD Foundation. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 * SUCH DAMAGE. 30 */ 31 32 /* 33 * this file contains a new buffer I/O scheme implementing a coherent 34 * VM object and buffer cache scheme. Pains have been taken to make 35 * sure that the performance degradation associated with schemes such 36 * as this is not realized. 37 * 38 * Author: John S. Dyson 39 * Significant help during the development and debugging phases 40 * had been provided by David Greenman, also of the FreeBSD core team. 41 * 42 * see man buf(9) for more info. 43 */ 44 45 #include <sys/cdefs.h> 46 __FBSDID("$FreeBSD$"); 47 48 #include <sys/param.h> 49 #include <sys/systm.h> 50 #include <sys/bio.h> 51 #include <sys/conf.h> 52 #include <sys/buf.h> 53 #include <sys/devicestat.h> 54 #include <sys/eventhandler.h> 55 #include <sys/fail.h> 56 #include <sys/limits.h> 57 #include <sys/lock.h> 58 #include <sys/malloc.h> 59 #include <sys/mount.h> 60 #include <sys/mutex.h> 61 #include <sys/kernel.h> 62 #include <sys/kthread.h> 63 #include <sys/proc.h> 64 #include <sys/resourcevar.h> 65 #include <sys/rwlock.h> 66 #include <sys/sysctl.h> 67 #include <sys/sysproto.h> 68 #include <sys/vmem.h> 69 #include <sys/vmmeter.h> 70 #include <sys/vnode.h> 71 #include <sys/watchdog.h> 72 #include <geom/geom.h> 73 #include <vm/vm.h> 74 #include <vm/vm_param.h> 75 #include <vm/vm_kern.h> 76 #include <vm/vm_pageout.h> 77 #include <vm/vm_page.h> 78 #include <vm/vm_object.h> 79 #include <vm/vm_extern.h> 80 #include <vm/vm_map.h> 81 #include <vm/swap_pager.h> 82 #include "opt_compat.h" 83 #include "opt_swap.h" 84 85 static MALLOC_DEFINE(M_BIOBUF, "biobuf", "BIO buffer"); 86 87 struct bio_ops bioops; /* I/O operation notification */ 88 89 struct buf_ops buf_ops_bio = { 90 .bop_name = "buf_ops_bio", 91 .bop_write = bufwrite, 92 .bop_strategy = bufstrategy, 93 .bop_sync = bufsync, 94 .bop_bdflush = bufbdflush, 95 }; 96 97 static struct buf *buf; /* buffer header pool */ 98 extern struct buf *swbuf; /* Swap buffer header pool. 
*/ 99 caddr_t unmapped_buf; 100 101 /* Used below and for softdep flushing threads in ufs/ffs/ffs_softdep.c */ 102 struct proc *bufdaemonproc; 103 104 static int inmem(struct vnode *vp, daddr_t blkno); 105 static void vm_hold_free_pages(struct buf *bp, int newbsize); 106 static void vm_hold_load_pages(struct buf *bp, vm_offset_t from, 107 vm_offset_t to); 108 static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m); 109 static void vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off, 110 vm_page_t m); 111 static void vfs_clean_pages_dirty_buf(struct buf *bp); 112 static void vfs_setdirty_locked_object(struct buf *bp); 113 static void vfs_vmio_invalidate(struct buf *bp); 114 static void vfs_vmio_release(struct buf *bp); 115 static void vfs_vmio_truncate(struct buf *bp, int npages); 116 static void vfs_vmio_extend(struct buf *bp, int npages, int size); 117 static int vfs_bio_clcheck(struct vnode *vp, int size, 118 daddr_t lblkno, daddr_t blkno); 119 static int buf_flush(struct vnode *vp, int); 120 static int flushbufqueues(struct vnode *, int, int); 121 static void buf_daemon(void); 122 static void bremfreel(struct buf *bp); 123 static __inline void bd_wakeup(void); 124 static int sysctl_runningspace(SYSCTL_HANDLER_ARGS); 125 #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ 126 defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) 127 static int sysctl_bufspace(SYSCTL_HANDLER_ARGS); 128 #endif 129 130 int vmiodirenable = TRUE; 131 SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW, &vmiodirenable, 0, 132 "Use the VM system for directory writes"); 133 long runningbufspace; 134 SYSCTL_LONG(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0, 135 "Amount of presently outstanding async buffer io"); 136 static long bufspace; 137 #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ 138 defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) 139 SYSCTL_PROC(_vfs, OID_AUTO, bufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RD, 140 &bufspace, 0, sysctl_bufspace, "L", "Virtual memory used for buffers"); 141 #else 142 SYSCTL_LONG(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0, 143 "Physical memory used for buffers"); 144 #endif 145 static long bufkvaspace; 146 SYSCTL_LONG(_vfs, OID_AUTO, bufkvaspace, CTLFLAG_RD, &bufkvaspace, 0, 147 "Kernel virtual memory used for buffers"); 148 static long maxbufspace; 149 SYSCTL_LONG(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RD, &maxbufspace, 0, 150 "Maximum allowed value of bufspace (including buf_daemon)"); 151 static long bufmallocspace; 152 SYSCTL_LONG(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0, 153 "Amount of malloced memory for buffers"); 154 static long maxbufmallocspace; 155 SYSCTL_LONG(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace, 0, 156 "Maximum amount of malloced memory for buffers"); 157 static long lobufspace; 158 SYSCTL_LONG(_vfs, OID_AUTO, lobufspace, CTLFLAG_RD, &lobufspace, 0, 159 "Minimum amount of buffers we want to have"); 160 long hibufspace; 161 SYSCTL_LONG(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD, &hibufspace, 0, 162 "Maximum allowed value of bufspace (excluding buf_daemon)"); 163 static int bufreusecnt; 164 SYSCTL_INT(_vfs, OID_AUTO, bufreusecnt, CTLFLAG_RW, &bufreusecnt, 0, 165 "Number of times we have reused a buffer"); 166 static int buffreekvacnt; 167 SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, 0, 168 "Number of times we have freed the KVA space from some buffer"); 169 static int bufdefragcnt; 170 SYSCTL_INT(_vfs, 
OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt, 0,
    "Number of times we have had to repeat buffer allocation to defragment");
static long lorunningspace;
SYSCTL_PROC(_vfs, OID_AUTO, lorunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE |
    CTLFLAG_RW, &lorunningspace, 0, sysctl_runningspace, "L",
    "Minimum preferred space used for in-progress I/O");
static long hirunningspace;
SYSCTL_PROC(_vfs, OID_AUTO, hirunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE |
    CTLFLAG_RW, &hirunningspace, 0, sysctl_runningspace, "L",
    "Maximum amount of space to use for in-progress I/O");
int dirtybufferflushes;
SYSCTL_INT(_vfs, OID_AUTO, dirtybufferflushes, CTLFLAG_RW, &dirtybufferflushes,
    0, "Number of bdwrite to bawrite conversions to limit dirty buffers");
int bdwriteskip;
SYSCTL_INT(_vfs, OID_AUTO, bdwriteskip, CTLFLAG_RW, &bdwriteskip,
    0, "Number of buffers supplied to bdwrite with snapshot deadlock risk");
int altbufferflushes;
SYSCTL_INT(_vfs, OID_AUTO, altbufferflushes, CTLFLAG_RW, &altbufferflushes,
    0, "Number of fsync flushes to limit dirty buffers");
static int recursiveflushes;
SYSCTL_INT(_vfs, OID_AUTO, recursiveflushes, CTLFLAG_RW, &recursiveflushes,
    0, "Number of flushes skipped due to being recursive");
static int numdirtybuffers;
SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD, &numdirtybuffers, 0,
    "Number of buffers that are dirty (have unwritten changes) at the moment");
static int lodirtybuffers;
SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW, &lodirtybuffers, 0,
    "How many buffers we want to have free before bufdaemon can sleep");
static int hidirtybuffers;
SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW, &hidirtybuffers, 0,
    "When the number of dirty buffers is considered severe");
int dirtybufthresh;
SYSCTL_INT(_vfs, OID_AUTO, dirtybufthresh, CTLFLAG_RW, &dirtybufthresh,
    0, "Number of bdwrite to bawrite conversions to clear dirty buffers");
static int numfreebuffers;
SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, &numfreebuffers, 0,
    "Number of free buffers");
static int lofreebuffers;
SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW, &lofreebuffers, 0,
    "XXX Unused");
static int hifreebuffers;
SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, &hifreebuffers, 0,
    "XXX Complicatedly unused");
static int getnewbufcalls;
SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, &getnewbufcalls, 0,
    "Number of calls to getnewbuf");
static int getnewbufrestarts;
SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW, &getnewbufrestarts, 0,
    "Number of times getnewbuf has had to restart a buffer acquisition");
static int mappingrestarts;
SYSCTL_INT(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RW, &mappingrestarts, 0,
    "Number of times getblk has had to restart a buffer mapping for "
    "unmapped buffer");
static int flushbufqtarget = 100;
SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0,
    "Amount of work to do in flushbufqueues when helping bufdaemon");
static long notbufdflushes;
SYSCTL_LONG(_vfs, OID_AUTO, notbufdflushes, CTLFLAG_RD, &notbufdflushes, 0,
    "Number of dirty buffer flushes done by the bufdaemon helpers");
static long barrierwrites;
SYSCTL_LONG(_vfs, OID_AUTO, barrierwrites, CTLFLAG_RW, &barrierwrites, 0,
    "Number of barrier writes");
SYSCTL_INT(_vfs, OID_AUTO, unmapped_buf_allowed, CTLFLAG_RD,
&unmapped_buf_allowed, 0, 234 "Permit the use of the unmapped i/o"); 235 236 /* 237 * Lock for the non-dirty bufqueues 238 */ 239 static struct mtx_padalign bqclean; 240 241 /* 242 * Lock for the dirty queue. 243 */ 244 static struct mtx_padalign bqdirty; 245 246 /* 247 * This lock synchronizes access to bd_request. 248 */ 249 static struct mtx_padalign bdlock; 250 251 /* 252 * This lock protects the runningbufreq and synchronizes runningbufwakeup and 253 * waitrunningbufspace(). 254 */ 255 static struct mtx_padalign rbreqlock; 256 257 /* 258 * Lock that protects needsbuffer and the sleeps/wakeups surrounding it. 259 */ 260 static struct rwlock_padalign nblock; 261 262 /* 263 * Lock that protects bdirtywait. 264 */ 265 static struct mtx_padalign bdirtylock; 266 267 /* 268 * Wakeup point for bufdaemon, as well as indicator of whether it is already 269 * active. Set to 1 when the bufdaemon is already "on" the queue, 0 when it 270 * is idling. 271 */ 272 static int bd_request; 273 274 /* 275 * Request for the buf daemon to write more buffers than is indicated by 276 * lodirtybuf. This may be necessary to push out excess dependencies or 277 * defragment the address space where a simple count of the number of dirty 278 * buffers is insufficient to characterize the demand for flushing them. 279 */ 280 static int bd_speedupreq; 281 282 /* 283 * bogus page -- for I/O to/from partially complete buffers 284 * this is a temporary solution to the problem, but it is not 285 * really that bad. it would be better to split the buffer 286 * for input in the case of buffers partially already in memory, 287 * but the code is intricate enough already. 288 */ 289 vm_page_t bogus_page; 290 291 /* 292 * Synchronization (sleep/wakeup) variable for active buffer space requests. 293 * Set when wait starts, cleared prior to wakeup(). 294 * Used in runningbufwakeup() and waitrunningbufspace(). 295 */ 296 static int runningbufreq; 297 298 /* 299 * Synchronization (sleep/wakeup) variable for buffer requests. 300 * Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done 301 * by and/or. 302 * Used in numdirtywakeup(), bufspacewakeup(), bufcountadd(), bwillwrite(), 303 * getnewbuf(), and getblk(). 304 */ 305 static volatile int needsbuffer; 306 307 /* 308 * Synchronization for bwillwrite() waiters. 309 */ 310 static int bdirtywait; 311 312 /* 313 * Definitions for the buffer free lists. 314 */ 315 #define BUFFER_QUEUES 4 /* number of free buffer queues */ 316 317 #define QUEUE_NONE 0 /* on no queue */ 318 #define QUEUE_CLEAN 1 /* non-B_DELWRI buffers */ 319 #define QUEUE_DIRTY 2 /* B_DELWRI buffers */ 320 #define QUEUE_EMPTY 3 /* empty buffer headers */ 321 #define QUEUE_SENTINEL 1024 /* not an queue index, but mark for sentinel */ 322 323 /* Queues for free buffers with various properties */ 324 static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } }; 325 #ifdef INVARIANTS 326 static int bq_len[BUFFER_QUEUES]; 327 #endif 328 329 /* 330 * Single global constant for BUF_WMESG, to avoid getting multiple references. 331 * buf_wmesg is referred from macros. 
332 */ 333 const char *buf_wmesg = BUF_WMESG; 334 335 #define VFS_BIO_NEED_ANY 0x01 /* any freeable buffer */ 336 #define VFS_BIO_NEED_FREE 0x04 /* wait for free bufs, hi hysteresis */ 337 #define VFS_BIO_NEED_BUFSPACE 0x08 /* wait for buf space, lo hysteresis */ 338 339 static int 340 sysctl_runningspace(SYSCTL_HANDLER_ARGS) 341 { 342 long value; 343 int error; 344 345 value = *(long *)arg1; 346 error = sysctl_handle_long(oidp, &value, 0, req); 347 if (error != 0 || req->newptr == NULL) 348 return (error); 349 mtx_lock(&rbreqlock); 350 if (arg1 == &hirunningspace) { 351 if (value < lorunningspace) 352 error = EINVAL; 353 else 354 hirunningspace = value; 355 } else { 356 KASSERT(arg1 == &lorunningspace, 357 ("%s: unknown arg1", __func__)); 358 if (value > hirunningspace) 359 error = EINVAL; 360 else 361 lorunningspace = value; 362 } 363 mtx_unlock(&rbreqlock); 364 return (error); 365 } 366 367 #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ 368 defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) 369 static int 370 sysctl_bufspace(SYSCTL_HANDLER_ARGS) 371 { 372 long lvalue; 373 int ivalue; 374 375 if (sizeof(int) == sizeof(long) || req->oldlen >= sizeof(long)) 376 return (sysctl_handle_long(oidp, arg1, arg2, req)); 377 lvalue = *(long *)arg1; 378 if (lvalue > INT_MAX) 379 /* On overflow, still write out a long to trigger ENOMEM. */ 380 return (sysctl_handle_long(oidp, &lvalue, 0, req)); 381 ivalue = lvalue; 382 return (sysctl_handle_int(oidp, &ivalue, 0, req)); 383 } 384 #endif 385 386 /* 387 * bqlock: 388 * 389 * Return the appropriate queue lock based on the index. 390 */ 391 static inline struct mtx * 392 bqlock(int qindex) 393 { 394 395 if (qindex == QUEUE_DIRTY) 396 return (struct mtx *)(&bqdirty); 397 return (struct mtx *)(&bqclean); 398 } 399 400 /* 401 * bdirtywakeup: 402 * 403 * Wakeup any bwillwrite() waiters. 404 */ 405 static void 406 bdirtywakeup(void) 407 { 408 mtx_lock(&bdirtylock); 409 if (bdirtywait) { 410 bdirtywait = 0; 411 wakeup(&bdirtywait); 412 } 413 mtx_unlock(&bdirtylock); 414 } 415 416 /* 417 * bdirtysub: 418 * 419 * Decrement the numdirtybuffers count by one and wakeup any 420 * threads blocked in bwillwrite(). 421 */ 422 static void 423 bdirtysub(void) 424 { 425 426 if (atomic_fetchadd_int(&numdirtybuffers, -1) == 427 (lodirtybuffers + hidirtybuffers) / 2) 428 bdirtywakeup(); 429 } 430 431 /* 432 * bdirtyadd: 433 * 434 * Increment the numdirtybuffers count by one and wakeup the buf 435 * daemon if needed. 436 */ 437 static void 438 bdirtyadd(void) 439 { 440 441 /* 442 * Only do the wakeup once as we cross the boundary. The 443 * buf daemon will keep running until the condition clears. 444 */ 445 if (atomic_fetchadd_int(&numdirtybuffers, 1) == 446 (lodirtybuffers + hidirtybuffers) / 2) 447 bd_wakeup(); 448 } 449 450 /* 451 * bufspacewakeup: 452 * 453 * Called when buffer space is potentially available for recovery. 454 * getnewbuf() will block on this flag when it is unable to free 455 * sufficient buffer space. Buffer space becomes recoverable when 456 * bp's get placed back in the queues. 457 */ 458 static __inline void 459 bufspacewakeup(void) 460 { 461 int need_wakeup, on; 462 463 /* 464 * If someone is waiting for bufspace, wake them up. Even 465 * though we may not have freed the kva space yet, the waiting 466 * process will be able to now. 
467 */ 468 rw_rlock(&nblock); 469 for (;;) { 470 need_wakeup = 0; 471 on = needsbuffer; 472 if ((on & VFS_BIO_NEED_BUFSPACE) == 0) 473 break; 474 need_wakeup = 1; 475 if (atomic_cmpset_rel_int(&needsbuffer, on, 476 on & ~VFS_BIO_NEED_BUFSPACE)) 477 break; 478 } 479 if (need_wakeup) 480 wakeup(__DEVOLATILE(void *, &needsbuffer)); 481 rw_runlock(&nblock); 482 } 483 484 /* 485 * bufspaceadjust: 486 * 487 * Adjust the reported bufspace for a KVA managed buffer, possibly 488 * waking any waiters. 489 */ 490 static void 491 bufspaceadjust(struct buf *bp, int bufsize) 492 { 493 int diff; 494 495 KASSERT((bp->b_flags & B_MALLOC) == 0, 496 ("bufspaceadjust: malloc buf %p", bp)); 497 diff = bufsize - bp->b_bufsize; 498 if (diff < 0) { 499 atomic_subtract_long(&bufspace, -diff); 500 bufspacewakeup(); 501 } else 502 atomic_add_long(&bufspace, diff); 503 bp->b_bufsize = bufsize; 504 } 505 506 /* 507 * bufmallocadjust: 508 * 509 * Adjust the reported bufspace for a malloc managed buffer, possibly 510 * waking any waiters. 511 */ 512 static void 513 bufmallocadjust(struct buf *bp, int bufsize) 514 { 515 int diff; 516 517 KASSERT((bp->b_flags & B_MALLOC) != 0, 518 ("bufmallocadjust: non-malloc buf %p", bp)); 519 diff = bufsize - bp->b_bufsize; 520 if (diff < 0) { 521 atomic_subtract_long(&bufmallocspace, -diff); 522 bufspacewakeup(); 523 } else 524 atomic_add_long(&bufmallocspace, diff); 525 bp->b_bufsize = bufsize; 526 } 527 528 /* 529 * runningwakeup: 530 * 531 * Wake up processes that are waiting on asynchronous writes to fall 532 * below lorunningspace. 533 */ 534 static void 535 runningwakeup(void) 536 { 537 538 mtx_lock(&rbreqlock); 539 if (runningbufreq) { 540 runningbufreq = 0; 541 wakeup(&runningbufreq); 542 } 543 mtx_unlock(&rbreqlock); 544 } 545 546 /* 547 * runningbufwakeup: 548 * 549 * Decrement the outstanding write count according. 550 */ 551 void 552 runningbufwakeup(struct buf *bp) 553 { 554 long space, bspace; 555 556 bspace = bp->b_runningbufspace; 557 if (bspace == 0) 558 return; 559 space = atomic_fetchadd_long(&runningbufspace, -bspace); 560 KASSERT(space >= bspace, ("runningbufspace underflow %ld %ld", 561 space, bspace)); 562 bp->b_runningbufspace = 0; 563 /* 564 * Only acquire the lock and wakeup on the transition from exceeding 565 * the threshold to falling below it. 566 */ 567 if (space < lorunningspace) 568 return; 569 if (space - bspace > lorunningspace) 570 return; 571 runningwakeup(); 572 } 573 574 /* 575 * bufcountadd: 576 * 577 * Called when a buffer has been added to one of the free queues to 578 * account for the buffer and to wakeup anyone waiting for free buffers. 579 * This typically occurs when large amounts of metadata are being handled 580 * by the buffer cache ( else buffer space runs out first, usually ). 
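 *
 *	The waiter side of this handshake is getnewbuf(), which is not part
 *	of this excerpt; the following is only a rough, illustrative sketch
 *	of how such a waiter uses the nblock lock and the VFS_BIO_NEED_*
 *	flags, not a copy of the real code:
 *
 *		rw_wlock(&nblock);
 *		needsbuffer |= VFS_BIO_NEED_ANY;
 *		while ((needsbuffer & VFS_BIO_NEED_ANY) != 0)
 *			rw_sleep(__DEVOLATILE(void *, &needsbuffer), &nblock,
 *			    PRIBIO, "newbuf", 0);
 *		rw_wunlock(&nblock);
 *
 *	bufcountadd() below clears the relevant bits with a cmpset while
 *	holding nblock shared and only then calls wakeup(), so a waiter that
 *	re-checks the flags under the lock cannot miss the wakeup.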
581 */ 582 static __inline void 583 bufcountadd(struct buf *bp) 584 { 585 int mask, need_wakeup, old, on; 586 587 KASSERT((bp->b_flags & B_INFREECNT) == 0, 588 ("buf %p already counted as free", bp)); 589 bp->b_flags |= B_INFREECNT; 590 old = atomic_fetchadd_int(&numfreebuffers, 1); 591 KASSERT(old >= 0 && old < nbuf, 592 ("numfreebuffers climbed to %d", old + 1)); 593 mask = VFS_BIO_NEED_ANY; 594 if (numfreebuffers >= hifreebuffers) 595 mask |= VFS_BIO_NEED_FREE; 596 rw_rlock(&nblock); 597 for (;;) { 598 need_wakeup = 0; 599 on = needsbuffer; 600 if (on == 0) 601 break; 602 need_wakeup = 1; 603 if (atomic_cmpset_rel_int(&needsbuffer, on, on & ~mask)) 604 break; 605 } 606 if (need_wakeup) 607 wakeup(__DEVOLATILE(void *, &needsbuffer)); 608 rw_runlock(&nblock); 609 } 610 611 /* 612 * bufcountsub: 613 * 614 * Decrement the numfreebuffers count as needed. 615 */ 616 static void 617 bufcountsub(struct buf *bp) 618 { 619 int old; 620 621 /* 622 * Fixup numfreebuffers count. If the buffer is invalid or not 623 * delayed-write, the buffer was free and we must decrement 624 * numfreebuffers. 625 */ 626 if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) { 627 KASSERT((bp->b_flags & B_INFREECNT) != 0, 628 ("buf %p not counted in numfreebuffers", bp)); 629 bp->b_flags &= ~B_INFREECNT; 630 old = atomic_fetchadd_int(&numfreebuffers, -1); 631 KASSERT(old > 0, ("numfreebuffers dropped to %d", old - 1)); 632 } 633 } 634 635 /* 636 * waitrunningbufspace() 637 * 638 * runningbufspace is a measure of the amount of I/O currently 639 * running. This routine is used in async-write situations to 640 * prevent creating huge backups of pending writes to a device. 641 * Only asynchronous writes are governed by this function. 642 * 643 * This does NOT turn an async write into a sync write. It waits 644 * for earlier writes to complete and generally returns before the 645 * caller's write has reached the device. 646 */ 647 void 648 waitrunningbufspace(void) 649 { 650 651 mtx_lock(&rbreqlock); 652 while (runningbufspace > hirunningspace) { 653 runningbufreq = 1; 654 msleep(&runningbufreq, &rbreqlock, PVM, "wdrain", 0); 655 } 656 mtx_unlock(&rbreqlock); 657 } 658 659 660 /* 661 * vfs_buf_test_cache: 662 * 663 * Called when a buffer is extended. This function clears the B_CACHE 664 * bit if the newly extended portion of the buffer does not contain 665 * valid data. 
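 *
 *	A small worked example (illustrative numbers only): a buffer with
 *	b_offset 0 that is grown from 2048 to 4096 bytes now also covers
 *	bytes 2048-4095 of its first page, so the caller invokes this with
 *	off/size describing that new range.  If
 *	vm_page_is_valid(m, 2048, 2048) returns 0, B_CACHE is cleared and
 *	the caller knows the extended buffer must be read before its
 *	contents can be trusted.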
666 */ 667 static __inline void 668 vfs_buf_test_cache(struct buf *bp, vm_ooffset_t foff, vm_offset_t off, 669 vm_offset_t size, vm_page_t m) 670 { 671 672 VM_OBJECT_ASSERT_LOCKED(m->object); 673 if (bp->b_flags & B_CACHE) { 674 int base = (foff + off) & PAGE_MASK; 675 if (vm_page_is_valid(m, base, size) == 0) 676 bp->b_flags &= ~B_CACHE; 677 } 678 } 679 680 /* Wake up the buffer daemon if necessary */ 681 static __inline void 682 bd_wakeup(void) 683 { 684 685 mtx_lock(&bdlock); 686 if (bd_request == 0) { 687 bd_request = 1; 688 wakeup(&bd_request); 689 } 690 mtx_unlock(&bdlock); 691 } 692 693 /* 694 * bd_speedup - speedup the buffer cache flushing code 695 */ 696 void 697 bd_speedup(void) 698 { 699 int needwake; 700 701 mtx_lock(&bdlock); 702 needwake = 0; 703 if (bd_speedupreq == 0 || bd_request == 0) 704 needwake = 1; 705 bd_speedupreq = 1; 706 bd_request = 1; 707 if (needwake) 708 wakeup(&bd_request); 709 mtx_unlock(&bdlock); 710 } 711 712 #ifndef NSWBUF_MIN 713 #define NSWBUF_MIN 16 714 #endif 715 716 #ifdef __i386__ 717 #define TRANSIENT_DENOM 5 718 #else 719 #define TRANSIENT_DENOM 10 720 #endif 721 722 /* 723 * Calculating buffer cache scaling values and reserve space for buffer 724 * headers. This is called during low level kernel initialization and 725 * may be called more then once. We CANNOT write to the memory area 726 * being reserved at this time. 727 */ 728 caddr_t 729 kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est) 730 { 731 int tuned_nbuf; 732 long maxbuf, maxbuf_sz, buf_sz, biotmap_sz; 733 734 /* 735 * physmem_est is in pages. Convert it to kilobytes (assumes 736 * PAGE_SIZE is >= 1K) 737 */ 738 physmem_est = physmem_est * (PAGE_SIZE / 1024); 739 740 /* 741 * The nominal buffer size (and minimum KVA allocation) is BKVASIZE. 742 * For the first 64MB of ram nominally allocate sufficient buffers to 743 * cover 1/4 of our ram. Beyond the first 64MB allocate additional 744 * buffers to cover 1/10 of our ram over 64MB. When auto-sizing 745 * the buffer cache we limit the eventual kva reservation to 746 * maxbcache bytes. 747 * 748 * factor represents the 1/4 x ram conversion. 749 */ 750 if (nbuf == 0) { 751 int factor = 4 * BKVASIZE / 1024; 752 753 nbuf = 50; 754 if (physmem_est > 4096) 755 nbuf += min((physmem_est - 4096) / factor, 756 65536 / factor); 757 if (physmem_est > 65536) 758 nbuf += min((physmem_est - 65536) * 2 / (factor * 5), 759 32 * 1024 * 1024 / (factor * 5)); 760 761 if (maxbcache && nbuf > maxbcache / BKVASIZE) 762 nbuf = maxbcache / BKVASIZE; 763 tuned_nbuf = 1; 764 } else 765 tuned_nbuf = 0; 766 767 /* XXX Avoid unsigned long overflows later on with maxbufspace. */ 768 maxbuf = (LONG_MAX / 3) / BKVASIZE; 769 if (nbuf > maxbuf) { 770 if (!tuned_nbuf) 771 printf("Warning: nbufs lowered from %d to %ld\n", nbuf, 772 maxbuf); 773 nbuf = maxbuf; 774 } 775 776 /* 777 * Ideal allocation size for the transient bio submap is 10% 778 * of the maximal space buffer map. This roughly corresponds 779 * to the amount of the buffer mapped for typical UFS load. 780 * 781 * Clip the buffer map to reserve space for the transient 782 * BIOs, if its extent is bigger than 90% (80% on i386) of the 783 * maximum buffer map extent on the platform. 784 * 785 * The fall-back to the maxbuf in case of maxbcache unset, 786 * allows to not trim the buffer KVA for the architectures 787 * with ample KVA space. 788 */ 789 if (bio_transient_maxcnt == 0 && unmapped_buf_allowed) { 790 maxbuf_sz = maxbcache != 0 ? 
		    maxbcache : maxbuf * BKVASIZE;
		buf_sz = (long)nbuf * BKVASIZE;
		if (buf_sz < maxbuf_sz / TRANSIENT_DENOM *
		    (TRANSIENT_DENOM - 1)) {
			/*
			 * There is more KVA than memory.  Do not
			 * adjust buffer map size, and assign the rest
			 * of maxbuf to transient map.
			 */
			biotmap_sz = maxbuf_sz - buf_sz;
		} else {
			/*
			 * Buffer map spans all KVA we could afford on
			 * this platform.  Give 10% (20% on i386) of
			 * the buffer map to the transient bio map.
			 */
			biotmap_sz = buf_sz / TRANSIENT_DENOM;
			buf_sz -= biotmap_sz;
		}
		if (biotmap_sz / INT_MAX > MAXPHYS)
			bio_transient_maxcnt = INT_MAX;
		else
			bio_transient_maxcnt = biotmap_sz / MAXPHYS;
		/*
		 * Artificially limit to 1024 simultaneous in-flight I/Os
		 * using the transient mapping.
		 */
		if (bio_transient_maxcnt > 1024)
			bio_transient_maxcnt = 1024;
		if (tuned_nbuf)
			nbuf = buf_sz / BKVASIZE;
	}

	/*
	 * swbufs are used as temporary holders for I/O, such as paging I/O.
	 * We have no less than 16 and no more than 256.
	 */
	nswbuf = min(nbuf / 4, 256);
	TUNABLE_INT_FETCH("kern.nswbuf", &nswbuf);
	if (nswbuf < NSWBUF_MIN)
		nswbuf = NSWBUF_MIN;

	/*
	 * Reserve space for the buffer cache buffers
	 */
	swbuf = (void *)v;
	v = (caddr_t)(swbuf + nswbuf);
	buf = (void *)v;
	v = (caddr_t)(buf + nbuf);

	return(v);
}

/* Initialize the buffer subsystem.  Called before use of any buffers. */
void
bufinit(void)
{
	struct buf *bp;
	int i;

	CTASSERT(MAXBCACHEBUF >= MAXBSIZE);
	mtx_init(&bqclean, "bufq clean lock", NULL, MTX_DEF);
	mtx_init(&bqdirty, "bufq dirty lock", NULL, MTX_DEF);
	mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF);
	rw_init(&nblock, "needsbuffer lock");
	mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF);
	mtx_init(&bdirtylock, "dirty buf lock", NULL, MTX_DEF);

	/* next, make a null set of free lists */
	for (i = 0; i < BUFFER_QUEUES; i++)
		TAILQ_INIT(&bufqueues[i]);

	unmapped_buf = (caddr_t)kva_alloc(MAXPHYS);

	/* finally, initialize each buffer header and stick on empty q */
	for (i = 0; i < nbuf; i++) {
		bp = &buf[i];
		bzero(bp, sizeof *bp);
		bp->b_flags = B_INVAL | B_INFREECNT;
		bp->b_rcred = NOCRED;
		bp->b_wcred = NOCRED;
		bp->b_qindex = QUEUE_EMPTY;
		bp->b_xflags = 0;
		bp->b_data = bp->b_kvabase = unmapped_buf;
		LIST_INIT(&bp->b_dep);
		BUF_LOCKINIT(bp);
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
#ifdef INVARIANTS
		bq_len[QUEUE_EMPTY]++;
#endif
	}

	/*
	 * maxbufspace is the absolute maximum amount of buffer space we are
	 * allowed to reserve in KVM and in real terms.  The absolute maximum
	 * is nominally used by buf_daemon.  hibufspace is the nominal maximum
	 * used by most other processes.  The differential is required to
	 * ensure that buf_daemon is able to run when other processes might
	 * be blocked waiting for buffer space.
	 *
	 * maxbufspace is based on BKVASIZE.  Allocating buffers larger than
	 * this may result in KVM fragmentation which is not handled optimally
	 * by the system.
	 */
	maxbufspace = (long)nbuf * BKVASIZE;
	hibufspace = lmax(3 * maxbufspace / 4, maxbufspace - MAXBCACHEBUF * 10);
	lobufspace = hibufspace - MAXBCACHEBUF;

	/*
	 * Note: The 16 MiB upper limit for hirunningspace was chosen
	 * arbitrarily and may need further tuning.
It corresponds to 901 * 128 outstanding write IO requests (if IO size is 128 KiB), 902 * which fits with many RAID controllers' tagged queuing limits. 903 * The lower 1 MiB limit is the historical upper limit for 904 * hirunningspace. 905 */ 906 hirunningspace = lmax(lmin(roundup(hibufspace / 64, MAXBCACHEBUF), 907 16 * 1024 * 1024), 1024 * 1024); 908 lorunningspace = roundup((hirunningspace * 2) / 3, MAXBCACHEBUF); 909 910 /* 911 * Limit the amount of malloc memory since it is wired permanently into 912 * the kernel space. Even though this is accounted for in the buffer 913 * allocation, we don't want the malloced region to grow uncontrolled. 914 * The malloc scheme improves memory utilization significantly on average 915 * (small) directories. 916 */ 917 maxbufmallocspace = hibufspace / 20; 918 919 /* 920 * Reduce the chance of a deadlock occuring by limiting the number 921 * of delayed-write dirty buffers we allow to stack up. 922 */ 923 hidirtybuffers = nbuf / 4 + 20; 924 dirtybufthresh = hidirtybuffers * 9 / 10; 925 numdirtybuffers = 0; 926 /* 927 * To support extreme low-memory systems, make sure hidirtybuffers cannot 928 * eat up all available buffer space. This occurs when our minimum cannot 929 * be met. We try to size hidirtybuffers to 3/4 our buffer space assuming 930 * BKVASIZE'd buffers. 931 */ 932 while ((long)hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) { 933 hidirtybuffers >>= 1; 934 } 935 lodirtybuffers = hidirtybuffers / 2; 936 937 /* 938 * Try to keep the number of free buffers in the specified range, 939 * and give special processes (e.g. like buf_daemon) access to an 940 * emergency reserve. 941 */ 942 lofreebuffers = nbuf / 18 + 5; 943 hifreebuffers = 2 * lofreebuffers; 944 numfreebuffers = nbuf; 945 946 bogus_page = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ | 947 VM_ALLOC_NORMAL | VM_ALLOC_WIRED); 948 } 949 950 #ifdef INVARIANTS 951 static inline void 952 vfs_buf_check_mapped(struct buf *bp) 953 { 954 955 KASSERT(bp->b_kvabase != unmapped_buf, 956 ("mapped buf: b_kvabase was not updated %p", bp)); 957 KASSERT(bp->b_data != unmapped_buf, 958 ("mapped buf: b_data was not updated %p", bp)); 959 KASSERT(bp->b_data < unmapped_buf || bp->b_data >= unmapped_buf + 960 MAXPHYS, ("b_data + b_offset unmapped %p", bp)); 961 } 962 963 static inline void 964 vfs_buf_check_unmapped(struct buf *bp) 965 { 966 967 KASSERT(bp->b_data == unmapped_buf, 968 ("unmapped buf: corrupted b_data %p", bp)); 969 } 970 971 #define BUF_CHECK_MAPPED(bp) vfs_buf_check_mapped(bp) 972 #define BUF_CHECK_UNMAPPED(bp) vfs_buf_check_unmapped(bp) 973 #else 974 #define BUF_CHECK_MAPPED(bp) do {} while (0) 975 #define BUF_CHECK_UNMAPPED(bp) do {} while (0) 976 #endif 977 978 static int 979 isbufbusy(struct buf *bp) 980 { 981 if (((bp->b_flags & (B_INVAL | B_PERSISTENT)) == 0 && 982 BUF_ISLOCKED(bp)) || 983 ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI)) 984 return (1); 985 return (0); 986 } 987 988 /* 989 * Shutdown the system cleanly to prepare for reboot, halt, or power off. 990 */ 991 void 992 bufshutdown(int show_busybufs) 993 { 994 static int first_buf_printf = 1; 995 struct buf *bp; 996 int iter, nbusy, pbusy; 997 #ifndef PREEMPTION 998 int subiter; 999 #endif 1000 1001 /* 1002 * Sync filesystems for shutdown 1003 */ 1004 wdog_kern_pat(WD_LASTVAL); 1005 sys_sync(curthread, NULL); 1006 1007 /* 1008 * With soft updates, some buffers that are 1009 * written will be remarked as dirty until other 1010 * buffers are written. 
1011 */ 1012 for (iter = pbusy = 0; iter < 20; iter++) { 1013 nbusy = 0; 1014 for (bp = &buf[nbuf]; --bp >= buf; ) 1015 if (isbufbusy(bp)) 1016 nbusy++; 1017 if (nbusy == 0) { 1018 if (first_buf_printf) 1019 printf("All buffers synced."); 1020 break; 1021 } 1022 if (first_buf_printf) { 1023 printf("Syncing disks, buffers remaining... "); 1024 first_buf_printf = 0; 1025 } 1026 printf("%d ", nbusy); 1027 if (nbusy < pbusy) 1028 iter = 0; 1029 pbusy = nbusy; 1030 1031 wdog_kern_pat(WD_LASTVAL); 1032 sys_sync(curthread, NULL); 1033 1034 #ifdef PREEMPTION 1035 /* 1036 * Drop Giant and spin for a while to allow 1037 * interrupt threads to run. 1038 */ 1039 DROP_GIANT(); 1040 DELAY(50000 * iter); 1041 PICKUP_GIANT(); 1042 #else 1043 /* 1044 * Drop Giant and context switch several times to 1045 * allow interrupt threads to run. 1046 */ 1047 DROP_GIANT(); 1048 for (subiter = 0; subiter < 50 * iter; subiter++) { 1049 thread_lock(curthread); 1050 mi_switch(SW_VOL, NULL); 1051 thread_unlock(curthread); 1052 DELAY(1000); 1053 } 1054 PICKUP_GIANT(); 1055 #endif 1056 } 1057 printf("\n"); 1058 /* 1059 * Count only busy local buffers to prevent forcing 1060 * a fsck if we're just a client of a wedged NFS server 1061 */ 1062 nbusy = 0; 1063 for (bp = &buf[nbuf]; --bp >= buf; ) { 1064 if (isbufbusy(bp)) { 1065 #if 0 1066 /* XXX: This is bogus. We should probably have a BO_REMOTE flag instead */ 1067 if (bp->b_dev == NULL) { 1068 TAILQ_REMOVE(&mountlist, 1069 bp->b_vp->v_mount, mnt_list); 1070 continue; 1071 } 1072 #endif 1073 nbusy++; 1074 if (show_busybufs > 0) { 1075 printf( 1076 "%d: buf:%p, vnode:%p, flags:%0x, blkno:%jd, lblkno:%jd, buflock:", 1077 nbusy, bp, bp->b_vp, bp->b_flags, 1078 (intmax_t)bp->b_blkno, 1079 (intmax_t)bp->b_lblkno); 1080 BUF_LOCKPRINTINFO(bp); 1081 if (show_busybufs > 1) 1082 vn_printf(bp->b_vp, 1083 "vnode content: "); 1084 } 1085 } 1086 } 1087 if (nbusy) { 1088 /* 1089 * Failed to sync all blocks. Indicate this and don't 1090 * unmount filesystems (thus forcing an fsck on reboot). 1091 */ 1092 printf("Giving up on %d buffers\n", nbusy); 1093 DELAY(5000000); /* 5 seconds */ 1094 } else { 1095 if (!first_buf_printf) 1096 printf("Final sync complete\n"); 1097 /* 1098 * Unmount filesystems 1099 */ 1100 if (panicstr == 0) 1101 vfs_unmountall(); 1102 } 1103 swapoff_all(); 1104 DELAY(100000); /* wait for console output to finish */ 1105 } 1106 1107 static void 1108 bpmap_qenter(struct buf *bp) 1109 { 1110 1111 BUF_CHECK_MAPPED(bp); 1112 1113 /* 1114 * bp->b_data is relative to bp->b_offset, but 1115 * bp->b_offset may be offset into the first page. 1116 */ 1117 bp->b_data = (caddr_t)trunc_page((vm_offset_t)bp->b_data); 1118 pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages); 1119 bp->b_data = (caddr_t)((vm_offset_t)bp->b_data | 1120 (vm_offset_t)(bp->b_offset & PAGE_MASK)); 1121 } 1122 1123 /* 1124 * binsfree: 1125 * 1126 * Insert the buffer into the appropriate free list. 1127 */ 1128 static void 1129 binsfree(struct buf *bp, int qindex) 1130 { 1131 struct mtx *olock, *nlock; 1132 1133 BUF_ASSERT_XLOCKED(bp); 1134 1135 nlock = bqlock(qindex); 1136 /* Handle delayed bremfree() processing. 
*/ 1137 if (bp->b_flags & B_REMFREE) { 1138 olock = bqlock(bp->b_qindex); 1139 mtx_lock(olock); 1140 bremfreel(bp); 1141 if (olock != nlock) { 1142 mtx_unlock(olock); 1143 mtx_lock(nlock); 1144 } 1145 } else 1146 mtx_lock(nlock); 1147 1148 if (bp->b_qindex != QUEUE_NONE) 1149 panic("binsfree: free buffer onto another queue???"); 1150 1151 bp->b_qindex = qindex; 1152 if (bp->b_flags & B_AGE) 1153 TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist); 1154 else 1155 TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist); 1156 #ifdef INVARIANTS 1157 bq_len[bp->b_qindex]++; 1158 #endif 1159 mtx_unlock(nlock); 1160 1161 /* 1162 * Something we can maybe free or reuse. 1163 */ 1164 if (bp->b_bufsize && !(bp->b_flags & B_DELWRI)) 1165 bufspacewakeup(); 1166 1167 if ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI)) 1168 bufcountadd(bp); 1169 } 1170 1171 /* 1172 * bremfree: 1173 * 1174 * Mark the buffer for removal from the appropriate free list. 1175 * 1176 */ 1177 void 1178 bremfree(struct buf *bp) 1179 { 1180 1181 CTR3(KTR_BUF, "bremfree(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); 1182 KASSERT((bp->b_flags & B_REMFREE) == 0, 1183 ("bremfree: buffer %p already marked for delayed removal.", bp)); 1184 KASSERT(bp->b_qindex != QUEUE_NONE, 1185 ("bremfree: buffer %p not on a queue.", bp)); 1186 BUF_ASSERT_XLOCKED(bp); 1187 1188 bp->b_flags |= B_REMFREE; 1189 bufcountsub(bp); 1190 } 1191 1192 /* 1193 * bremfreef: 1194 * 1195 * Force an immediate removal from a free list. Used only in nfs when 1196 * it abuses the b_freelist pointer. 1197 */ 1198 void 1199 bremfreef(struct buf *bp) 1200 { 1201 struct mtx *qlock; 1202 1203 qlock = bqlock(bp->b_qindex); 1204 mtx_lock(qlock); 1205 bremfreel(bp); 1206 mtx_unlock(qlock); 1207 } 1208 1209 /* 1210 * bremfreel: 1211 * 1212 * Removes a buffer from the free list, must be called with the 1213 * correct qlock held. 1214 */ 1215 static void 1216 bremfreel(struct buf *bp) 1217 { 1218 1219 CTR3(KTR_BUF, "bremfreel(%p) vp %p flags %X", 1220 bp, bp->b_vp, bp->b_flags); 1221 KASSERT(bp->b_qindex != QUEUE_NONE, 1222 ("bremfreel: buffer %p not on a queue.", bp)); 1223 BUF_ASSERT_XLOCKED(bp); 1224 mtx_assert(bqlock(bp->b_qindex), MA_OWNED); 1225 1226 TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist); 1227 #ifdef INVARIANTS 1228 KASSERT(bq_len[bp->b_qindex] >= 1, ("queue %d underflow", 1229 bp->b_qindex)); 1230 bq_len[bp->b_qindex]--; 1231 #endif 1232 bp->b_qindex = QUEUE_NONE; 1233 /* 1234 * If this was a delayed bremfree() we only need to remove the buffer 1235 * from the queue and return the stats are already done. 1236 */ 1237 if (bp->b_flags & B_REMFREE) { 1238 bp->b_flags &= ~B_REMFREE; 1239 return; 1240 } 1241 bufcountsub(bp); 1242 } 1243 1244 /* 1245 * bufkvafree: 1246 * 1247 * Free the kva allocation for a buffer. 
1248 * 1249 */ 1250 static void 1251 bufkvafree(struct buf *bp) 1252 { 1253 1254 #ifdef INVARIANTS 1255 if (bp->b_kvasize == 0) { 1256 KASSERT(bp->b_kvabase == unmapped_buf && 1257 bp->b_data == unmapped_buf, 1258 ("Leaked KVA space on %p", bp)); 1259 } else if (buf_mapped(bp)) 1260 BUF_CHECK_MAPPED(bp); 1261 else 1262 BUF_CHECK_UNMAPPED(bp); 1263 #endif 1264 if (bp->b_kvasize == 0) 1265 return; 1266 1267 vmem_free(buffer_arena, (vm_offset_t)bp->b_kvabase, bp->b_kvasize); 1268 atomic_subtract_long(&bufkvaspace, bp->b_kvasize); 1269 atomic_add_int(&buffreekvacnt, 1); 1270 bp->b_data = bp->b_kvabase = unmapped_buf; 1271 bp->b_kvasize = 0; 1272 } 1273 1274 /* 1275 * bufkvaalloc: 1276 * 1277 * Allocate the buffer KVA and set b_kvasize and b_kvabase. 1278 */ 1279 static int 1280 bufkvaalloc(struct buf *bp, int maxsize, int gbflags) 1281 { 1282 vm_offset_t addr; 1283 int error; 1284 1285 KASSERT((gbflags & GB_UNMAPPED) == 0 || (gbflags & GB_KVAALLOC) != 0, 1286 ("Invalid gbflags 0x%x in %s", gbflags, __func__)); 1287 1288 bufkvafree(bp); 1289 1290 addr = 0; 1291 error = vmem_alloc(buffer_arena, maxsize, M_BESTFIT | M_NOWAIT, &addr); 1292 if (error != 0) { 1293 /* 1294 * Buffer map is too fragmented. Request the caller 1295 * to defragment the map. 1296 */ 1297 atomic_add_int(&bufdefragcnt, 1); 1298 return (error); 1299 } 1300 bp->b_kvabase = (caddr_t)addr; 1301 bp->b_kvasize = maxsize; 1302 atomic_add_long(&bufkvaspace, bp->b_kvasize); 1303 if ((gbflags & GB_UNMAPPED) != 0) { 1304 bp->b_data = unmapped_buf; 1305 BUF_CHECK_UNMAPPED(bp); 1306 } else { 1307 bp->b_data = bp->b_kvabase; 1308 BUF_CHECK_MAPPED(bp); 1309 } 1310 return (0); 1311 } 1312 1313 /* 1314 * Attempt to initiate asynchronous I/O on read-ahead blocks. We must 1315 * clear BIO_ERROR and B_INVAL prior to initiating I/O . If B_CACHE is set, 1316 * the buffer is valid and we do not have to do anything. 1317 */ 1318 void 1319 breada(struct vnode * vp, daddr_t * rablkno, int * rabsize, 1320 int cnt, struct ucred * cred) 1321 { 1322 struct buf *rabp; 1323 int i; 1324 1325 for (i = 0; i < cnt; i++, rablkno++, rabsize++) { 1326 if (inmem(vp, *rablkno)) 1327 continue; 1328 rabp = getblk(vp, *rablkno, *rabsize, 0, 0, 0); 1329 1330 if ((rabp->b_flags & B_CACHE) == 0) { 1331 if (!TD_IS_IDLETHREAD(curthread)) 1332 curthread->td_ru.ru_inblock++; 1333 rabp->b_flags |= B_ASYNC; 1334 rabp->b_flags &= ~B_INVAL; 1335 rabp->b_ioflags &= ~BIO_ERROR; 1336 rabp->b_iocmd = BIO_READ; 1337 if (rabp->b_rcred == NOCRED && cred != NOCRED) 1338 rabp->b_rcred = crhold(cred); 1339 vfs_busy_pages(rabp, 0); 1340 BUF_KERNPROC(rabp); 1341 rabp->b_iooffset = dbtob(rabp->b_blkno); 1342 bstrategy(rabp); 1343 } else { 1344 brelse(rabp); 1345 } 1346 } 1347 } 1348 1349 /* 1350 * Entry point for bread() and breadn() via #defines in sys/buf.h. 1351 * 1352 * Get a buffer with the specified data. Look in the cache first. We 1353 * must clear BIO_ERROR and B_INVAL prior to initiating I/O. If B_CACHE 1354 * is set, the buffer is valid and we do not have to do anything, see 1355 * getblk(). Also starts asynchronous I/O on read-ahead blocks. 1356 */ 1357 int 1358 breadn_flags(struct vnode *vp, daddr_t blkno, int size, daddr_t *rablkno, 1359 int *rabsize, int cnt, struct ucred *cred, int flags, struct buf **bpp) 1360 { 1361 struct buf *bp; 1362 int rv = 0, readwait = 0; 1363 1364 CTR3(KTR_BUF, "breadn(%p, %jd, %d)", vp, blkno, size); 1365 /* 1366 * Can only return NULL if GB_LOCK_NOWAIT flag is specified. 
1367 */ 1368 *bpp = bp = getblk(vp, blkno, size, 0, 0, flags); 1369 if (bp == NULL) 1370 return (EBUSY); 1371 1372 /* if not found in cache, do some I/O */ 1373 if ((bp->b_flags & B_CACHE) == 0) { 1374 if (!TD_IS_IDLETHREAD(curthread)) 1375 curthread->td_ru.ru_inblock++; 1376 bp->b_iocmd = BIO_READ; 1377 bp->b_flags &= ~B_INVAL; 1378 bp->b_ioflags &= ~BIO_ERROR; 1379 if (bp->b_rcred == NOCRED && cred != NOCRED) 1380 bp->b_rcred = crhold(cred); 1381 vfs_busy_pages(bp, 0); 1382 bp->b_iooffset = dbtob(bp->b_blkno); 1383 bstrategy(bp); 1384 ++readwait; 1385 } 1386 1387 breada(vp, rablkno, rabsize, cnt, cred); 1388 1389 if (readwait) { 1390 rv = bufwait(bp); 1391 } 1392 return (rv); 1393 } 1394 1395 /* 1396 * Write, release buffer on completion. (Done by iodone 1397 * if async). Do not bother writing anything if the buffer 1398 * is invalid. 1399 * 1400 * Note that we set B_CACHE here, indicating that buffer is 1401 * fully valid and thus cacheable. This is true even of NFS 1402 * now so we set it generally. This could be set either here 1403 * or in biodone() since the I/O is synchronous. We put it 1404 * here. 1405 */ 1406 int 1407 bufwrite(struct buf *bp) 1408 { 1409 int oldflags; 1410 struct vnode *vp; 1411 long space; 1412 int vp_md; 1413 1414 CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); 1415 if ((bp->b_bufobj->bo_flag & BO_DEAD) != 0) { 1416 bp->b_flags |= B_INVAL | B_RELBUF; 1417 bp->b_flags &= ~B_CACHE; 1418 brelse(bp); 1419 return (ENXIO); 1420 } 1421 if (bp->b_flags & B_INVAL) { 1422 brelse(bp); 1423 return (0); 1424 } 1425 1426 if (bp->b_flags & B_BARRIER) 1427 barrierwrites++; 1428 1429 oldflags = bp->b_flags; 1430 1431 BUF_ASSERT_HELD(bp); 1432 1433 if (bp->b_pin_count > 0) 1434 bunpin_wait(bp); 1435 1436 KASSERT(!(bp->b_vflags & BV_BKGRDINPROG), 1437 ("FFS background buffer should not get here %p", bp)); 1438 1439 vp = bp->b_vp; 1440 if (vp) 1441 vp_md = vp->v_vflag & VV_MD; 1442 else 1443 vp_md = 0; 1444 1445 /* 1446 * Mark the buffer clean. Increment the bufobj write count 1447 * before bundirty() call, to prevent other thread from seeing 1448 * empty dirty list and zero counter for writes in progress, 1449 * falsely indicating that the bufobj is clean. 1450 */ 1451 bufobj_wref(bp->b_bufobj); 1452 bundirty(bp); 1453 1454 bp->b_flags &= ~B_DONE; 1455 bp->b_ioflags &= ~BIO_ERROR; 1456 bp->b_flags |= B_CACHE; 1457 bp->b_iocmd = BIO_WRITE; 1458 1459 vfs_busy_pages(bp, 1); 1460 1461 /* 1462 * Normal bwrites pipeline writes 1463 */ 1464 bp->b_runningbufspace = bp->b_bufsize; 1465 space = atomic_fetchadd_long(&runningbufspace, bp->b_runningbufspace); 1466 1467 if (!TD_IS_IDLETHREAD(curthread)) 1468 curthread->td_ru.ru_oublock++; 1469 if (oldflags & B_ASYNC) 1470 BUF_KERNPROC(bp); 1471 bp->b_iooffset = dbtob(bp->b_blkno); 1472 bstrategy(bp); 1473 1474 if ((oldflags & B_ASYNC) == 0) { 1475 int rtval = bufwait(bp); 1476 brelse(bp); 1477 return (rtval); 1478 } else if (space > hirunningspace) { 1479 /* 1480 * don't allow the async write to saturate the I/O 1481 * system. We will not deadlock here because 1482 * we are blocking waiting for I/O that is already in-progress 1483 * to complete. We do not block here if it is the update 1484 * or syncer daemon trying to clean up as that can lead 1485 * to deadlock. 
1486 */ 1487 if ((curthread->td_pflags & TDP_NORUNNINGBUF) == 0 && !vp_md) 1488 waitrunningbufspace(); 1489 } 1490 1491 return (0); 1492 } 1493 1494 void 1495 bufbdflush(struct bufobj *bo, struct buf *bp) 1496 { 1497 struct buf *nbp; 1498 1499 if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10) { 1500 (void) VOP_FSYNC(bp->b_vp, MNT_NOWAIT, curthread); 1501 altbufferflushes++; 1502 } else if (bo->bo_dirty.bv_cnt > dirtybufthresh) { 1503 BO_LOCK(bo); 1504 /* 1505 * Try to find a buffer to flush. 1506 */ 1507 TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) { 1508 if ((nbp->b_vflags & BV_BKGRDINPROG) || 1509 BUF_LOCK(nbp, 1510 LK_EXCLUSIVE | LK_NOWAIT, NULL)) 1511 continue; 1512 if (bp == nbp) 1513 panic("bdwrite: found ourselves"); 1514 BO_UNLOCK(bo); 1515 /* Don't countdeps with the bo lock held. */ 1516 if (buf_countdeps(nbp, 0)) { 1517 BO_LOCK(bo); 1518 BUF_UNLOCK(nbp); 1519 continue; 1520 } 1521 if (nbp->b_flags & B_CLUSTEROK) { 1522 vfs_bio_awrite(nbp); 1523 } else { 1524 bremfree(nbp); 1525 bawrite(nbp); 1526 } 1527 dirtybufferflushes++; 1528 break; 1529 } 1530 if (nbp == NULL) 1531 BO_UNLOCK(bo); 1532 } 1533 } 1534 1535 /* 1536 * Delayed write. (Buffer is marked dirty). Do not bother writing 1537 * anything if the buffer is marked invalid. 1538 * 1539 * Note that since the buffer must be completely valid, we can safely 1540 * set B_CACHE. In fact, we have to set B_CACHE here rather then in 1541 * biodone() in order to prevent getblk from writing the buffer 1542 * out synchronously. 1543 */ 1544 void 1545 bdwrite(struct buf *bp) 1546 { 1547 struct thread *td = curthread; 1548 struct vnode *vp; 1549 struct bufobj *bo; 1550 1551 CTR3(KTR_BUF, "bdwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); 1552 KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); 1553 KASSERT((bp->b_flags & B_BARRIER) == 0, 1554 ("Barrier request in delayed write %p", bp)); 1555 BUF_ASSERT_HELD(bp); 1556 1557 if (bp->b_flags & B_INVAL) { 1558 brelse(bp); 1559 return; 1560 } 1561 1562 /* 1563 * If we have too many dirty buffers, don't create any more. 1564 * If we are wildly over our limit, then force a complete 1565 * cleanup. Otherwise, just keep the situation from getting 1566 * out of control. Note that we have to avoid a recursive 1567 * disaster and not try to clean up after our own cleanup! 1568 */ 1569 vp = bp->b_vp; 1570 bo = bp->b_bufobj; 1571 if ((td->td_pflags & (TDP_COWINPROGRESS|TDP_INBDFLUSH)) == 0) { 1572 td->td_pflags |= TDP_INBDFLUSH; 1573 BO_BDFLUSH(bo, bp); 1574 td->td_pflags &= ~TDP_INBDFLUSH; 1575 } else 1576 recursiveflushes++; 1577 1578 bdirty(bp); 1579 /* 1580 * Set B_CACHE, indicating that the buffer is fully valid. This is 1581 * true even of NFS now. 1582 */ 1583 bp->b_flags |= B_CACHE; 1584 1585 /* 1586 * This bmap keeps the system from needing to do the bmap later, 1587 * perhaps when the system is attempting to do a sync. Since it 1588 * is likely that the indirect block -- or whatever other datastructure 1589 * that the filesystem needs is still in memory now, it is a good 1590 * thing to do this. Note also, that if the pageout daemon is 1591 * requesting a sync -- there might not be enough memory to do 1592 * the bmap then... So, this is important to do. 1593 */ 1594 if (vp->v_type != VCHR && bp->b_lblkno == bp->b_blkno) { 1595 VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL); 1596 } 1597 1598 /* 1599 * Set the *dirty* buffer range based upon the VM system dirty 1600 * pages. 1601 * 1602 * Mark the buffer pages as clean. 
We need to do this here to 1603 * satisfy the vnode_pager and the pageout daemon, so that it 1604 * thinks that the pages have been "cleaned". Note that since 1605 * the pages are in a delayed write buffer -- the VFS layer 1606 * "will" see that the pages get written out on the next sync, 1607 * or perhaps the cluster will be completed. 1608 */ 1609 vfs_clean_pages_dirty_buf(bp); 1610 bqrelse(bp); 1611 1612 /* 1613 * note: we cannot initiate I/O from a bdwrite even if we wanted to, 1614 * due to the softdep code. 1615 */ 1616 } 1617 1618 /* 1619 * bdirty: 1620 * 1621 * Turn buffer into delayed write request. We must clear BIO_READ and 1622 * B_RELBUF, and we must set B_DELWRI. We reassign the buffer to 1623 * itself to properly update it in the dirty/clean lists. We mark it 1624 * B_DONE to ensure that any asynchronization of the buffer properly 1625 * clears B_DONE ( else a panic will occur later ). 1626 * 1627 * bdirty() is kinda like bdwrite() - we have to clear B_INVAL which 1628 * might have been set pre-getblk(). Unlike bwrite/bdwrite, bdirty() 1629 * should only be called if the buffer is known-good. 1630 * 1631 * Since the buffer is not on a queue, we do not update the numfreebuffers 1632 * count. 1633 * 1634 * The buffer must be on QUEUE_NONE. 1635 */ 1636 void 1637 bdirty(struct buf *bp) 1638 { 1639 1640 CTR3(KTR_BUF, "bdirty(%p) vp %p flags %X", 1641 bp, bp->b_vp, bp->b_flags); 1642 KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); 1643 KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE, 1644 ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex)); 1645 BUF_ASSERT_HELD(bp); 1646 bp->b_flags &= ~(B_RELBUF); 1647 bp->b_iocmd = BIO_WRITE; 1648 1649 if ((bp->b_flags & B_DELWRI) == 0) { 1650 bp->b_flags |= /* XXX B_DONE | */ B_DELWRI; 1651 reassignbuf(bp); 1652 bdirtyadd(); 1653 } 1654 } 1655 1656 /* 1657 * bundirty: 1658 * 1659 * Clear B_DELWRI for buffer. 1660 * 1661 * Since the buffer is not on a queue, we do not update the numfreebuffers 1662 * count. 1663 * 1664 * The buffer must be on QUEUE_NONE. 1665 */ 1666 1667 void 1668 bundirty(struct buf *bp) 1669 { 1670 1671 CTR3(KTR_BUF, "bundirty(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); 1672 KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); 1673 KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE, 1674 ("bundirty: buffer %p still on queue %d", bp, bp->b_qindex)); 1675 BUF_ASSERT_HELD(bp); 1676 1677 if (bp->b_flags & B_DELWRI) { 1678 bp->b_flags &= ~B_DELWRI; 1679 reassignbuf(bp); 1680 bdirtysub(); 1681 } 1682 /* 1683 * Since it is now being written, we can clear its deferred write flag. 1684 */ 1685 bp->b_flags &= ~B_DEFERRED; 1686 } 1687 1688 /* 1689 * bawrite: 1690 * 1691 * Asynchronous write. Start output on a buffer, but do not wait for 1692 * it to complete. The buffer is released when the output completes. 1693 * 1694 * bwrite() ( or the VOP routine anyway ) is responsible for handling 1695 * B_INVAL buffers. Not us. 1696 */ 1697 void 1698 bawrite(struct buf *bp) 1699 { 1700 1701 bp->b_flags |= B_ASYNC; 1702 (void) bwrite(bp); 1703 } 1704 1705 /* 1706 * babarrierwrite: 1707 * 1708 * Asynchronous barrier write. Start output on a buffer, but do not 1709 * wait for it to complete. Place a write barrier after this write so 1710 * that this buffer and all buffers written before it are committed to 1711 * the disk before any buffers written after this write are committed 1712 * to the disk. The buffer is released when the output completes. 
1713 */ 1714 void 1715 babarrierwrite(struct buf *bp) 1716 { 1717 1718 bp->b_flags |= B_ASYNC | B_BARRIER; 1719 (void) bwrite(bp); 1720 } 1721 1722 /* 1723 * bbarrierwrite: 1724 * 1725 * Synchronous barrier write. Start output on a buffer and wait for 1726 * it to complete. Place a write barrier after this write so that 1727 * this buffer and all buffers written before it are committed to 1728 * the disk before any buffers written after this write are committed 1729 * to the disk. The buffer is released when the output completes. 1730 */ 1731 int 1732 bbarrierwrite(struct buf *bp) 1733 { 1734 1735 bp->b_flags |= B_BARRIER; 1736 return (bwrite(bp)); 1737 } 1738 1739 /* 1740 * bwillwrite: 1741 * 1742 * Called prior to the locking of any vnodes when we are expecting to 1743 * write. We do not want to starve the buffer cache with too many 1744 * dirty buffers so we block here. By blocking prior to the locking 1745 * of any vnodes we attempt to avoid the situation where a locked vnode 1746 * prevents the various system daemons from flushing related buffers. 1747 */ 1748 void 1749 bwillwrite(void) 1750 { 1751 1752 if (numdirtybuffers >= hidirtybuffers) { 1753 mtx_lock(&bdirtylock); 1754 while (numdirtybuffers >= hidirtybuffers) { 1755 bdirtywait = 1; 1756 msleep(&bdirtywait, &bdirtylock, (PRIBIO + 4), 1757 "flswai", 0); 1758 } 1759 mtx_unlock(&bdirtylock); 1760 } 1761 } 1762 1763 /* 1764 * Return true if we have too many dirty buffers. 1765 */ 1766 int 1767 buf_dirty_count_severe(void) 1768 { 1769 1770 return(numdirtybuffers >= hidirtybuffers); 1771 } 1772 1773 /* 1774 * brelse: 1775 * 1776 * Release a busy buffer and, if requested, free its resources. The 1777 * buffer will be stashed in the appropriate bufqueue[] allowing it 1778 * to be accessed later as a cache entity or reused for other purposes. 1779 */ 1780 void 1781 brelse(struct buf *bp) 1782 { 1783 int qindex; 1784 1785 CTR3(KTR_BUF, "brelse(%p) vp %p flags %X", 1786 bp, bp->b_vp, bp->b_flags); 1787 KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), 1788 ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); 1789 1790 if (BUF_LOCKRECURSED(bp)) { 1791 /* 1792 * Do not process, in particular, do not handle the 1793 * B_INVAL/B_RELBUF and do not release to free list. 1794 */ 1795 BUF_UNLOCK(bp); 1796 return; 1797 } 1798 1799 if (bp->b_flags & B_MANAGED) { 1800 bqrelse(bp); 1801 return; 1802 } 1803 1804 if ((bp->b_vflags & (BV_BKGRDINPROG | BV_BKGRDERR)) == BV_BKGRDERR) { 1805 BO_LOCK(bp->b_bufobj); 1806 bp->b_vflags &= ~BV_BKGRDERR; 1807 BO_UNLOCK(bp->b_bufobj); 1808 bdirty(bp); 1809 } 1810 if (bp->b_iocmd == BIO_WRITE && (bp->b_ioflags & BIO_ERROR) && 1811 bp->b_error == EIO && !(bp->b_flags & B_INVAL)) { 1812 /* 1813 * Failed write, redirty. Must clear BIO_ERROR to prevent 1814 * pages from being scrapped. If the error is anything 1815 * other than an I/O error (EIO), assume that retrying 1816 * is futile. 1817 */ 1818 bp->b_ioflags &= ~BIO_ERROR; 1819 bdirty(bp); 1820 } else if ((bp->b_flags & (B_NOCACHE | B_INVAL)) || 1821 (bp->b_ioflags & BIO_ERROR) || (bp->b_bufsize <= 0)) { 1822 /* 1823 * Either a failed I/O or we were asked to free or not 1824 * cache the buffer. 
		 */
		bp->b_flags |= B_INVAL;
		if (!LIST_EMPTY(&bp->b_dep))
			buf_deallocate(bp);
		if (bp->b_flags & B_DELWRI)
			bdirtysub();
		bp->b_flags &= ~(B_DELWRI | B_CACHE);
		if ((bp->b_flags & B_VMIO) == 0) {
			if (bp->b_bufsize)
				allocbuf(bp, 0);
			if (bp->b_vp)
				brelvp(bp);
		}
	}

	/*
	 * We must clear B_RELBUF if B_DELWRI is set.  If vfs_vmio_release()
	 * is called with B_DELWRI set, the underlying pages may wind up
	 * getting freed causing a previous write (bdwrite()) to get 'lost'
	 * because pages associated with a B_DELWRI bp are marked clean.
	 *
	 * We still allow the B_INVAL case to call vfs_vmio_release(), even
	 * if B_DELWRI is set.
	 */
	if (bp->b_flags & B_DELWRI)
		bp->b_flags &= ~B_RELBUF;

	/*
	 * VMIO buffer rundown.  It is not very necessary to keep a VMIO buffer
	 * constituted, not even NFS buffers now.  Two flags affect this.  If
	 * B_INVAL, the struct buf is invalidated but the VM object is kept
	 * around ( i.e. so it is trivial to reconstitute the buffer later ).
	 *
	 * If BIO_ERROR or B_NOCACHE is set, pages in the VM object will be
	 * invalidated.  BIO_ERROR cannot be set for a failed write unless the
	 * buffer is also B_INVAL because it hits the re-dirtying code above.
	 *
	 * Normally we can do this whether a buffer is B_DELWRI or not.  If
	 * the buffer is an NFS buffer, it is tracking piecemeal writes or
	 * the commit state and we cannot afford to lose the buffer.  If the
	 * buffer has a background write in progress, we need to keep it
	 * around to prevent it from being reconstituted and starting a second
	 * background write.
	 */
	if ((bp->b_flags & B_VMIO) && (bp->b_flags & B_NOCACHE ||
	    (bp->b_ioflags & BIO_ERROR && bp->b_iocmd == BIO_READ)) &&
	    !(bp->b_vp->v_mount != NULL &&
	    (bp->b_vp->v_mount->mnt_vfc->vfc_flags & VFCF_NETWORK) != 0 &&
	    !vn_isdisk(bp->b_vp, NULL) && (bp->b_flags & B_DELWRI)))
		vfs_vmio_invalidate(bp);

	if ((bp->b_flags & (B_INVAL | B_RELBUF)) != 0) {
		if (bp->b_flags & B_VMIO)
			vfs_vmio_release(bp);
		if (bp->b_bufsize != 0)
			allocbuf(bp, 0);
		if (bp->b_vp != NULL)
			brelvp(bp);
	}

	/*
	 * If the buffer has junk contents signal it and eventually
	 * clean up B_DELWRI and disassociate the vnode so that gbincore()
	 * doesn't find it.
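	 *
	 * The queue chosen below follows directly from the buffer state
	 * (a restatement of the code that follows, for orientation):
	 *
	 *	b_bufsize == 0				-- QUEUE_EMPTY, B_AGE
	 *	B_INVAL/B_NOCACHE/B_RELBUF/BIO_ERROR	-- QUEUE_CLEAN, B_AGE
	 *	B_DELWRI				-- QUEUE_DIRTY
	 *	otherwise				-- QUEUE_CLEAN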
1889 */ 1890 if (bp->b_bufsize == 0 || (bp->b_ioflags & BIO_ERROR) != 0 || 1891 (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF)) != 0) 1892 bp->b_flags |= B_INVAL; 1893 if (bp->b_flags & B_INVAL) { 1894 if (bp->b_flags & B_DELWRI) 1895 bundirty(bp); 1896 if (bp->b_vp) 1897 brelvp(bp); 1898 } 1899 1900 /* buffers with no memory */ 1901 if (bp->b_bufsize == 0) { 1902 bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA); 1903 if (bp->b_vflags & BV_BKGRDINPROG) 1904 panic("losing buffer 1"); 1905 bufkvafree(bp); 1906 qindex = QUEUE_EMPTY; 1907 bp->b_flags |= B_AGE; 1908 /* buffers with junk contents */ 1909 } else if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) || 1910 (bp->b_ioflags & BIO_ERROR)) { 1911 bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA); 1912 if (bp->b_vflags & BV_BKGRDINPROG) 1913 panic("losing buffer 2"); 1914 qindex = QUEUE_CLEAN; 1915 bp->b_flags |= B_AGE; 1916 /* remaining buffers */ 1917 } else if (bp->b_flags & B_DELWRI) 1918 qindex = QUEUE_DIRTY; 1919 else 1920 qindex = QUEUE_CLEAN; 1921 1922 binsfree(bp, qindex); 1923 1924 bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF | B_DIRECT); 1925 if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY)) 1926 panic("brelse: not dirty"); 1927 /* unlock */ 1928 BUF_UNLOCK(bp); 1929 } 1930 1931 /* 1932 * Release a buffer back to the appropriate queue but do not try to free 1933 * it. The buffer is expected to be used again soon. 1934 * 1935 * bqrelse() is used by bdwrite() to requeue a delayed write, and used by 1936 * biodone() to requeue an async I/O on completion. It is also used when 1937 * known good buffers need to be requeued but we think we may need the data 1938 * again soon. 1939 * 1940 * XXX we should be able to leave the B_RELBUF hint set on completion. 1941 */ 1942 void 1943 bqrelse(struct buf *bp) 1944 { 1945 int qindex; 1946 1947 CTR3(KTR_BUF, "bqrelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); 1948 KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), 1949 ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); 1950 1951 if (BUF_LOCKRECURSED(bp)) { 1952 /* do not release to free list */ 1953 BUF_UNLOCK(bp); 1954 return; 1955 } 1956 bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF); 1957 1958 if (bp->b_flags & B_MANAGED) { 1959 if (bp->b_flags & B_REMFREE) 1960 bremfreef(bp); 1961 goto out; 1962 } 1963 1964 /* buffers with stale but valid contents */ 1965 if ((bp->b_flags & B_DELWRI) != 0 || (bp->b_vflags & (BV_BKGRDINPROG | 1966 BV_BKGRDERR)) == BV_BKGRDERR) { 1967 BO_LOCK(bp->b_bufobj); 1968 bp->b_vflags &= ~BV_BKGRDERR; 1969 BO_UNLOCK(bp->b_bufobj); 1970 qindex = QUEUE_DIRTY; 1971 } else { 1972 if ((bp->b_flags & B_DELWRI) == 0 && 1973 (bp->b_xflags & BX_VNDIRTY)) 1974 panic("bqrelse: not dirty"); 1975 qindex = QUEUE_CLEAN; 1976 } 1977 binsfree(bp, qindex); 1978 1979 out: 1980 /* unlock */ 1981 BUF_UNLOCK(bp); 1982 } 1983 1984 /* 1985 * Complete I/O to a VMIO backed page. Validate the pages as appropriate, 1986 * restore bogus pages. 
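 *
 * "Bogus" pages are the placeholders vfs_busy_pages() installs when a
 * fully valid page of a partially instantiated buffer must not be
 * overwritten by the read, roughly:
 *
 *	if (m->valid == VM_PAGE_BITS_ALL && (bp->b_flags & B_CACHE) == 0)
 *		bp->b_pages[i] = bogus_page;
 *
 * The loop below looks the real pages up again and, for mapped
 * buffers, re-enters them into the buffer's KVA with pmap_qenter().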
1987 */ 1988 static void 1989 vfs_vmio_iodone(struct buf *bp) 1990 { 1991 vm_ooffset_t foff; 1992 vm_page_t m; 1993 vm_object_t obj; 1994 struct vnode *vp; 1995 int bogus, i, iosize; 1996 1997 obj = bp->b_bufobj->bo_object; 1998 KASSERT(obj->paging_in_progress >= bp->b_npages, 1999 ("vfs_vmio_iodone: paging in progress(%d) < b_npages(%d)", 2000 obj->paging_in_progress, bp->b_npages)); 2001 2002 vp = bp->b_vp; 2003 KASSERT(vp->v_holdcnt > 0, 2004 ("vfs_vmio_iodone: vnode %p has zero hold count", vp)); 2005 KASSERT(vp->v_object != NULL, 2006 ("vfs_vmio_iodone: vnode %p has no vm_object", vp)); 2007 2008 foff = bp->b_offset; 2009 KASSERT(bp->b_offset != NOOFFSET, 2010 ("vfs_vmio_iodone: bp %p has no buffer offset", bp)); 2011 2012 bogus = 0; 2013 iosize = bp->b_bcount - bp->b_resid; 2014 VM_OBJECT_WLOCK(obj); 2015 for (i = 0; i < bp->b_npages; i++) { 2016 int resid; 2017 2018 resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff; 2019 if (resid > iosize) 2020 resid = iosize; 2021 2022 /* 2023 * cleanup bogus pages, restoring the originals 2024 */ 2025 m = bp->b_pages[i]; 2026 if (m == bogus_page) { 2027 bogus = 1; 2028 m = vm_page_lookup(obj, OFF_TO_IDX(foff)); 2029 if (m == NULL) 2030 panic("biodone: page disappeared!"); 2031 bp->b_pages[i] = m; 2032 } else if ((bp->b_iocmd == BIO_READ) && resid > 0) { 2033 /* 2034 * In the write case, the valid and clean bits are 2035 * already changed correctly ( see bdwrite() ), so we 2036 * only need to do this here in the read case. 2037 */ 2038 KASSERT((m->dirty & vm_page_bits(foff & PAGE_MASK, 2039 resid)) == 0, ("vfs_vmio_iodone: page %p " 2040 "has unexpected dirty bits", m)); 2041 vfs_page_set_valid(bp, foff, m); 2042 } 2043 KASSERT(OFF_TO_IDX(foff) == m->pindex, 2044 ("vfs_vmio_iodone: foff(%jd)/pindex(%ju) mismatch", 2045 (intmax_t)foff, (uintmax_t)m->pindex)); 2046 2047 vm_page_sunbusy(m); 2048 vm_object_pip_subtract(obj, 1); 2049 foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; 2050 iosize -= resid; 2051 } 2052 vm_object_pip_wakeupn(obj, 0); 2053 VM_OBJECT_WUNLOCK(obj); 2054 if (bogus && buf_mapped(bp)) { 2055 BUF_CHECK_MAPPED(bp); 2056 pmap_qenter(trunc_page((vm_offset_t)bp->b_data), 2057 bp->b_pages, bp->b_npages); 2058 } 2059 } 2060 2061 /* 2062 * Perform page invalidation when a buffer is released. The fully invalid 2063 * pages will be reclaimed later in vfs_vmio_release(). 2064 */ 2065 static void 2066 vfs_vmio_invalidate(struct buf *bp) 2067 { 2068 vm_object_t obj; 2069 vm_page_t m; 2070 int i, resid, poffset, presid; 2071 2072 /* 2073 * Get the base offset and length of the buffer. Note that 2074 * in the VMIO case if the buffer block size is not 2075 * page-aligned then b_data pointer may not be page-aligned. 2076 * But our b_pages[] array *IS* page aligned. 2077 * 2078 * block sizes less then DEV_BSIZE (usually 512) are not 2079 * supported due to the page granularity bits (m->valid, 2080 * m->dirty, etc...). 2081 * 2082 * See man buf(9) for more information 2083 */ 2084 obj = bp->b_bufobj->bo_object; 2085 resid = bp->b_bufsize; 2086 poffset = bp->b_offset & PAGE_MASK; 2087 VM_OBJECT_WLOCK(obj); 2088 for (i = 0; i < bp->b_npages; i++) { 2089 m = bp->b_pages[i]; 2090 if (m == bogus_page) 2091 panic("vfs_vmio_invalidate: Unexpected bogus page."); 2092 2093 presid = resid > (PAGE_SIZE - poffset) ? 
2094 (PAGE_SIZE - poffset) : resid; 2095 KASSERT(presid >= 0, ("brelse: extra page")); 2096 while (vm_page_xbusied(m)) { 2097 vm_page_lock(m); 2098 VM_OBJECT_WUNLOCK(obj); 2099 vm_page_busy_sleep(m, "mbncsh"); 2100 VM_OBJECT_WLOCK(obj); 2101 } 2102 if (pmap_page_wired_mappings(m) == 0) 2103 vm_page_set_invalid(m, poffset, presid); 2104 resid -= presid; 2105 poffset = 0; 2106 } 2107 VM_OBJECT_WUNLOCK(obj); 2108 } 2109 2110 /* Give pages used by the bp back to the VM system (where possible) */ 2111 static void 2112 vfs_vmio_release(struct buf *bp) 2113 { 2114 vm_object_t obj; 2115 vm_page_t m; 2116 int i; 2117 bool freed; 2118 2119 if (buf_mapped(bp)) { 2120 BUF_CHECK_MAPPED(bp); 2121 pmap_qremove(trunc_page((vm_offset_t)bp->b_data), bp->b_npages); 2122 } else 2123 BUF_CHECK_UNMAPPED(bp); 2124 obj = bp->b_bufobj->bo_object; 2125 if (obj != NULL) 2126 VM_OBJECT_WLOCK(obj); 2127 for (i = 0; i < bp->b_npages; i++) { 2128 m = bp->b_pages[i]; 2129 bp->b_pages[i] = NULL; 2130 vm_page_lock(m); 2131 if (vm_page_unwire(m, PQ_NONE)) { 2132 /* 2133 * Determine if the page should be freed before adding 2134 * it to the inactive queue. 2135 */ 2136 if ((bp->b_flags & B_ASYNC) == 0 && m->valid == 0) { 2137 freed = !vm_page_busied(m); 2138 if (freed) 2139 vm_page_free(m); 2140 } else if ((bp->b_flags & B_DIRECT) != 0) 2141 freed = vm_page_try_to_free(m); 2142 else 2143 freed = false; 2144 if (!freed) { 2145 /* 2146 * In order to maintain LRU page ordering, put 2147 * the page at the tail of the inactive queue. 2148 */ 2149 vm_page_deactivate(m); 2150 } 2151 } 2152 vm_page_unlock(m); 2153 } 2154 if (obj != NULL) 2155 VM_OBJECT_WUNLOCK(obj); 2156 2157 if (bp->b_bufsize) 2158 bufspaceadjust(bp, 0); 2159 bp->b_npages = 0; 2160 bp->b_flags &= ~B_VMIO; 2161 } 2162 2163 /* 2164 * Page-granular truncation of an existing VMIO buffer. 2165 */ 2166 static void 2167 vfs_vmio_truncate(struct buf *bp, int desiredpages) 2168 { 2169 vm_page_t m; 2170 int i; 2171 2172 if (bp->b_npages == desiredpages) 2173 return; 2174 2175 if (buf_mapped(bp)) { 2176 BUF_CHECK_MAPPED(bp); 2177 pmap_qremove((vm_offset_t)trunc_page((vm_offset_t)bp->b_data) + 2178 (desiredpages << PAGE_SHIFT), bp->b_npages - desiredpages); 2179 } else 2180 BUF_CHECK_UNMAPPED(bp); 2181 VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); 2182 for (i = desiredpages; i < bp->b_npages; i++) { 2183 /* 2184 * The page is not freed here -- it is the responsibility of 2185 * vnode_pager_setsize. 2186 */ 2187 m = bp->b_pages[i]; 2188 KASSERT(m != bogus_page, ("allocbuf: bogus page found")); 2189 while (vm_page_sleep_if_busy(m, "biodep")) 2190 continue; 2191 bp->b_pages[i] = NULL; 2192 vm_page_lock(m); 2193 vm_page_unwire(m, PQ_INACTIVE); 2194 vm_page_unlock(m); 2195 } 2196 VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); 2197 bp->b_npages = desiredpages; 2198 } 2199 2200 /* 2201 * Byte granular extension of VMIO buffers. 2202 */ 2203 static void 2204 vfs_vmio_extend(struct buf *bp, int desiredpages, int size) 2205 { 2206 /* 2207 * We are growing the buffer, possibly in a 2208 * byte-granular fashion. 2209 */ 2210 vm_object_t obj; 2211 vm_offset_t toff; 2212 vm_offset_t tinc; 2213 vm_page_t m; 2214 2215 /* 2216 * Step 1, bring in the VM pages from the object, allocating 2217 * them if necessary. We must clear B_CACHE if these pages 2218 * are not valid for the range covered by the buffer. 
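 *
 * Worked example (hypothetical sizes): growing a buffer from 4 KB to
 * 16 KB on a 4 KB-page machine grabs three more pages here.  Any page
 * that comes back with m->valid == 0 clears B_CACHE, forcing the
 * caller to read the block; pages with some valid bits survive this
 * step and get the byte-granular verdict in step 2 below.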
2219 */ 2220 obj = bp->b_bufobj->bo_object; 2221 VM_OBJECT_WLOCK(obj); 2222 while (bp->b_npages < desiredpages) { 2223 /* 2224 * We must allocate system pages since blocking 2225 * here could interfere with paging I/O, no 2226 * matter which process we are. 2227 * 2228 * Only exclusive busy can be tested here. 2229 * Blocking on shared busy might lead to 2230 * deadlocks once allocbuf() is called after 2231 * pages are vfs_busy_pages(). 2232 */ 2233 m = vm_page_grab(obj, OFF_TO_IDX(bp->b_offset) + bp->b_npages, 2234 VM_ALLOC_NOBUSY | VM_ALLOC_SYSTEM | 2235 VM_ALLOC_WIRED | VM_ALLOC_IGN_SBUSY | 2236 VM_ALLOC_COUNT(desiredpages - bp->b_npages)); 2237 if (m->valid == 0) 2238 bp->b_flags &= ~B_CACHE; 2239 bp->b_pages[bp->b_npages] = m; 2240 ++bp->b_npages; 2241 } 2242 2243 /* 2244 * Step 2. We've loaded the pages into the buffer, 2245 * we have to figure out if we can still have B_CACHE 2246 * set. Note that B_CACHE is set according to the 2247 * byte-granular range ( bcount and size ), not the 2248 * aligned range ( newbsize ). 2249 * 2250 * The VM test is against m->valid, which is DEV_BSIZE 2251 * aligned. Needless to say, the validity of the data 2252 * needs to also be DEV_BSIZE aligned. Note that this 2253 * fails with NFS if the server or some other client 2254 * extends the file's EOF. If our buffer is resized, 2255 * B_CACHE may remain set! XXX 2256 */ 2257 toff = bp->b_bcount; 2258 tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK); 2259 while ((bp->b_flags & B_CACHE) && toff < size) { 2260 vm_pindex_t pi; 2261 2262 if (tinc > (size - toff)) 2263 tinc = size - toff; 2264 pi = ((bp->b_offset & PAGE_MASK) + toff) >> PAGE_SHIFT; 2265 m = bp->b_pages[pi]; 2266 vfs_buf_test_cache(bp, bp->b_offset, toff, tinc, m); 2267 toff += tinc; 2268 tinc = PAGE_SIZE; 2269 } 2270 VM_OBJECT_WUNLOCK(obj); 2271 2272 /* 2273 * Step 3, fixup the KVA pmap. 2274 */ 2275 if (buf_mapped(bp)) 2276 bpmap_qenter(bp); 2277 else 2278 BUF_CHECK_UNMAPPED(bp); 2279 } 2280 2281 /* 2282 * Check to see if a block at a particular lbn is available for a clustered 2283 * write. 2284 */ 2285 static int 2286 vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno) 2287 { 2288 struct buf *bpa; 2289 int match; 2290 2291 match = 0; 2292 2293 /* If the buf isn't in core skip it */ 2294 if ((bpa = gbincore(&vp->v_bufobj, lblkno)) == NULL) 2295 return (0); 2296 2297 /* If the buf is busy we don't want to wait for it */ 2298 if (BUF_LOCK(bpa, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) 2299 return (0); 2300 2301 /* Only cluster with valid clusterable delayed write buffers */ 2302 if ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) != 2303 (B_DELWRI | B_CLUSTEROK)) 2304 goto done; 2305 2306 if (bpa->b_bufsize != size) 2307 goto done; 2308 2309 /* 2310 * Check to see if it is in the expected place on disk and that the 2311 * block has been mapped. 2312 */ 2313 if ((bpa->b_blkno != bpa->b_lblkno) && (bpa->b_blkno == blkno)) 2314 match = 1; 2315 done: 2316 BUF_UNLOCK(bpa); 2317 return (match); 2318 } 2319 2320 /* 2321 * vfs_bio_awrite: 2322 * 2323 * Implement clustered async writes for clearing out B_DELWRI buffers. 2324 * This is much better then the old way of writing only one buffer at 2325 * a time. Note that we may not be presented with the buffers in the 2326 * correct order, so we search for the cluster in both directions. 
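 *
 * Worked example (assuming, say, a 16 KB f_iosize and the usual
 * 128 KB MAXPHYS): maxcl is 8, so the scans below probe up to seven
 * blocks forward and then backward from lblkno for contiguous,
 * clusterable delayed writes; any run found (ncl > 1) is handed to
 * cluster_wbuild() as one large write instead of ncl small ones.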
2327 */ 2328 int 2329 vfs_bio_awrite(struct buf *bp) 2330 { 2331 struct bufobj *bo; 2332 int i; 2333 int j; 2334 daddr_t lblkno = bp->b_lblkno; 2335 struct vnode *vp = bp->b_vp; 2336 int ncl; 2337 int nwritten; 2338 int size; 2339 int maxcl; 2340 int gbflags; 2341 2342 bo = &vp->v_bufobj; 2343 gbflags = (bp->b_data == unmapped_buf) ? GB_UNMAPPED : 0; 2344 /* 2345 * right now we support clustered writing only to regular files. If 2346 * we find a clusterable block we could be in the middle of a cluster 2347 * rather then at the beginning. 2348 */ 2349 if ((vp->v_type == VREG) && 2350 (vp->v_mount != 0) && /* Only on nodes that have the size info */ 2351 (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) { 2352 2353 size = vp->v_mount->mnt_stat.f_iosize; 2354 maxcl = MAXPHYS / size; 2355 2356 BO_RLOCK(bo); 2357 for (i = 1; i < maxcl; i++) 2358 if (vfs_bio_clcheck(vp, size, lblkno + i, 2359 bp->b_blkno + ((i * size) >> DEV_BSHIFT)) == 0) 2360 break; 2361 2362 for (j = 1; i + j <= maxcl && j <= lblkno; j++) 2363 if (vfs_bio_clcheck(vp, size, lblkno - j, 2364 bp->b_blkno - ((j * size) >> DEV_BSHIFT)) == 0) 2365 break; 2366 BO_RUNLOCK(bo); 2367 --j; 2368 ncl = i + j; 2369 /* 2370 * this is a possible cluster write 2371 */ 2372 if (ncl != 1) { 2373 BUF_UNLOCK(bp); 2374 nwritten = cluster_wbuild(vp, size, lblkno - j, ncl, 2375 gbflags); 2376 return (nwritten); 2377 } 2378 } 2379 bremfree(bp); 2380 bp->b_flags |= B_ASYNC; 2381 /* 2382 * default (old) behavior, writing out only one block 2383 * 2384 * XXX returns b_bufsize instead of b_bcount for nwritten? 2385 */ 2386 nwritten = bp->b_bufsize; 2387 (void) bwrite(bp); 2388 2389 return (nwritten); 2390 } 2391 2392 /* 2393 * Ask the bufdaemon for help, or act as bufdaemon itself, when a 2394 * locked vnode is supplied. 2395 */ 2396 static void 2397 getnewbuf_bufd_help(struct vnode *vp, int gbflags, int slpflag, int slptimeo, 2398 int defrag) 2399 { 2400 struct thread *td; 2401 char *waitmsg; 2402 int error, fl, flags, norunbuf; 2403 2404 mtx_assert(&bqclean, MA_OWNED); 2405 2406 if (defrag) { 2407 flags = VFS_BIO_NEED_BUFSPACE; 2408 waitmsg = "nbufkv"; 2409 } else if (bufspace >= hibufspace) { 2410 waitmsg = "nbufbs"; 2411 flags = VFS_BIO_NEED_BUFSPACE; 2412 } else { 2413 waitmsg = "newbuf"; 2414 flags = VFS_BIO_NEED_ANY; 2415 } 2416 atomic_set_int(&needsbuffer, flags); 2417 mtx_unlock(&bqclean); 2418 2419 bd_speedup(); /* heeeelp */ 2420 if ((gbflags & GB_NOWAIT_BD) != 0) 2421 return; 2422 2423 td = curthread; 2424 rw_wlock(&nblock); 2425 while ((needsbuffer & flags) != 0) { 2426 if (vp != NULL && vp->v_type != VCHR && 2427 (td->td_pflags & TDP_BUFNEED) == 0) { 2428 rw_wunlock(&nblock); 2429 /* 2430 * getblk() is called with a vnode locked, and 2431 * some majority of the dirty buffers may as 2432 * well belong to the vnode. Flushing the 2433 * buffers there would make a progress that 2434 * cannot be achieved by the buf_daemon, that 2435 * cannot lock the vnode. 2436 */ 2437 norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) | 2438 (td->td_pflags & TDP_NORUNNINGBUF); 2439 2440 /* 2441 * Play bufdaemon. The getnewbuf() function 2442 * may be called while the thread owns lock 2443 * for another dirty buffer for the same 2444 * vnode, which makes it impossible to use 2445 * VOP_FSYNC() there, due to the buffer lock 2446 * recursion. 
2447 */ 2448 td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF; 2449 fl = buf_flush(vp, flushbufqtarget); 2450 td->td_pflags &= norunbuf; 2451 rw_wlock(&nblock); 2452 if (fl != 0) 2453 continue; 2454 if ((needsbuffer & flags) == 0) 2455 break; 2456 } 2457 error = rw_sleep(__DEVOLATILE(void *, &needsbuffer), &nblock, 2458 (PRIBIO + 4) | slpflag, waitmsg, slptimeo); 2459 if (error != 0) 2460 break; 2461 } 2462 rw_wunlock(&nblock); 2463 } 2464 2465 static void 2466 getnewbuf_reuse_bp(struct buf *bp, int qindex) 2467 { 2468 2469 CTR6(KTR_BUF, "getnewbuf(%p) vp %p flags %X kvasize %d bufsize %d " 2470 "queue %d (recycling)", bp, bp->b_vp, bp->b_flags, 2471 bp->b_kvasize, bp->b_bufsize, qindex); 2472 mtx_assert(&bqclean, MA_NOTOWNED); 2473 2474 /* 2475 * Note: we no longer distinguish between VMIO and non-VMIO 2476 * buffers. 2477 */ 2478 KASSERT((bp->b_flags & B_DELWRI) == 0, 2479 ("delwri buffer %p found in queue %d", bp, qindex)); 2480 2481 if (qindex == QUEUE_CLEAN) { 2482 if (bp->b_flags & B_VMIO) { 2483 bp->b_flags &= ~B_ASYNC; 2484 vfs_vmio_release(bp); 2485 } 2486 if (bp->b_vp != NULL) 2487 brelvp(bp); 2488 } 2489 2490 /* 2491 * Get the rest of the buffer freed up. b_kva* is still valid 2492 * after this operation. 2493 */ 2494 2495 if (bp->b_rcred != NOCRED) { 2496 crfree(bp->b_rcred); 2497 bp->b_rcred = NOCRED; 2498 } 2499 if (bp->b_wcred != NOCRED) { 2500 crfree(bp->b_wcred); 2501 bp->b_wcred = NOCRED; 2502 } 2503 if (!LIST_EMPTY(&bp->b_dep)) 2504 buf_deallocate(bp); 2505 if (bp->b_vflags & BV_BKGRDINPROG) 2506 panic("losing buffer 3"); 2507 KASSERT(bp->b_vp == NULL, ("bp: %p still has vnode %p. qindex: %d", 2508 bp, bp->b_vp, qindex)); 2509 KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0, 2510 ("bp: %p still on a buffer list. xflags %X", bp, bp->b_xflags)); 2511 2512 if (bp->b_bufsize) 2513 allocbuf(bp, 0); 2514 2515 bp->b_flags = 0; 2516 bp->b_ioflags = 0; 2517 bp->b_xflags = 0; 2518 KASSERT((bp->b_flags & B_INFREECNT) == 0, 2519 ("buf %p still counted as free?", bp)); 2520 bp->b_vflags = 0; 2521 bp->b_vp = NULL; 2522 bp->b_blkno = bp->b_lblkno = 0; 2523 bp->b_offset = NOOFFSET; 2524 bp->b_iodone = 0; 2525 bp->b_error = 0; 2526 bp->b_resid = 0; 2527 bp->b_bcount = 0; 2528 bp->b_npages = 0; 2529 bp->b_dirtyoff = bp->b_dirtyend = 0; 2530 bp->b_bufobj = NULL; 2531 bp->b_pin_count = 0; 2532 bp->b_data = bp->b_kvabase; 2533 bp->b_fsprivate1 = NULL; 2534 bp->b_fsprivate2 = NULL; 2535 bp->b_fsprivate3 = NULL; 2536 2537 LIST_INIT(&bp->b_dep); 2538 } 2539 2540 static struct buf * 2541 getnewbuf_scan(int maxsize, int defrag, int unmapped, int metadata) 2542 { 2543 struct buf *bp, *nbp; 2544 int nqindex, qindex, pass; 2545 2546 KASSERT(!unmapped || !defrag, ("both unmapped and defrag")); 2547 2548 pass = 0; 2549 restart: 2550 if (pass != 0) 2551 atomic_add_int(&getnewbufrestarts, 1); 2552 2553 nbp = NULL; 2554 mtx_lock(&bqclean); 2555 /* 2556 * If we're not defragging or low on bufspace attempt to make a new 2557 * buf from a header. 2558 */ 2559 if (defrag == 0 && bufspace + maxsize < hibufspace) { 2560 nqindex = QUEUE_EMPTY; 2561 nbp = TAILQ_FIRST(&bufqueues[nqindex]); 2562 } 2563 /* 2564 * All available buffers might be clean or we need to start recycling. 2565 */ 2566 if (nbp == NULL) { 2567 nqindex = QUEUE_CLEAN; 2568 nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]); 2569 } 2570 2571 /* 2572 * Run scan, possibly freeing data and/or kva mappings on the fly 2573 * depending. 
2574 */ 2575 while ((bp = nbp) != NULL) { 2576 qindex = nqindex; 2577 2578 /* 2579 * Calculate next bp (we can only use it if we do not 2580 * release the bqlock) 2581 */ 2582 if ((nbp = TAILQ_NEXT(bp, b_freelist)) == NULL) { 2583 switch (qindex) { 2584 case QUEUE_EMPTY: 2585 nqindex = QUEUE_CLEAN; 2586 nbp = TAILQ_FIRST(&bufqueues[nqindex]); 2587 if (nbp != NULL) 2588 break; 2589 /* FALLTHROUGH */ 2590 case QUEUE_CLEAN: 2591 if (metadata && pass == 0) { 2592 pass = 1; 2593 nqindex = QUEUE_EMPTY; 2594 nbp = TAILQ_FIRST(&bufqueues[nqindex]); 2595 } 2596 /* 2597 * nbp is NULL. 2598 */ 2599 break; 2600 } 2601 } 2602 /* 2603 * If we are defragging then we need a buffer with 2604 * b_kvasize != 0. This situation occurs when we 2605 * have many unmapped bufs. 2606 */ 2607 if (defrag && bp->b_kvasize == 0) 2608 continue; 2609 2610 /* 2611 * Start freeing the bp. This is somewhat involved. nbp 2612 * remains valid only for QUEUE_EMPTY[KVA] bp's. 2613 */ 2614 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) 2615 continue; 2616 /* 2617 * BKGRDINPROG can only be set with the buf and bufobj 2618 * locks both held. We tolerate a race to clear it here. 2619 */ 2620 if (bp->b_vflags & BV_BKGRDINPROG) { 2621 BUF_UNLOCK(bp); 2622 continue; 2623 } 2624 2625 /* 2626 * Requeue the background write buffer with error. 2627 */ 2628 if ((bp->b_vflags & BV_BKGRDERR) != 0) { 2629 bremfreel(bp); 2630 mtx_unlock(&bqclean); 2631 bqrelse(bp); 2632 continue; 2633 } 2634 2635 KASSERT(bp->b_qindex == qindex, 2636 ("getnewbuf: inconsistent queue %d bp %p", qindex, bp)); 2637 2638 bremfreel(bp); 2639 mtx_unlock(&bqclean); 2640 2641 /* 2642 * NOTE: nbp is now entirely invalid. We can only restart 2643 * the scan from this point on. 2644 */ 2645 getnewbuf_reuse_bp(bp, qindex); 2646 mtx_assert(&bqclean, MA_NOTOWNED); 2647 2648 /* 2649 * If we are defragging then free the buffer. 2650 */ 2651 if (defrag) { 2652 bp->b_flags |= B_INVAL; 2653 brelse(bp); 2654 defrag = 0; 2655 goto restart; 2656 } 2657 2658 /* 2659 * Notify any waiters for the buffer lock about 2660 * identity change by freeing the buffer. 2661 */ 2662 if (qindex == QUEUE_CLEAN && BUF_LOCKWAITERS(bp)) { 2663 bp->b_flags |= B_INVAL; 2664 brelse(bp); 2665 goto restart; 2666 } 2667 2668 if (metadata) 2669 break; 2670 2671 /* 2672 * If we are overcomitted then recover the buffer and its 2673 * KVM space. This occurs in rare situations when multiple 2674 * processes are blocked in getnewbuf() or allocbuf(). 2675 */ 2676 if (bufspace >= hibufspace && bp->b_kvasize != 0) { 2677 bp->b_flags |= B_INVAL; 2678 brelse(bp); 2679 goto restart; 2680 } 2681 break; 2682 } 2683 return (bp); 2684 } 2685 2686 /* 2687 * getnewbuf: 2688 * 2689 * Find and initialize a new buffer header, freeing up existing buffers 2690 * in the bufqueues as necessary. The new buffer is returned locked. 2691 * 2692 * Important: B_INVAL is not set. If the caller wishes to throw the 2693 * buffer away, the caller must set B_INVAL prior to calling brelse(). 
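 *
 * That is, roughly (illustrative):
 *
 *	bp->b_flags |= B_INVAL;
 *	brelse(bp);
 *
 * which is exactly what the defragmentation and identity-change
 * paths in getnewbuf_scan() above do with buffers they discard.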
2694 * 2695 * We block if: 2696 * We have insufficient buffer headers 2697 * We have insufficient buffer space 2698 * buffer_arena is too fragmented ( space reservation fails ) 2699 * If we have to flush dirty buffers ( but we try to avoid this ) 2700 */ 2701 static struct buf * 2702 getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int size, int maxsize, 2703 int gbflags) 2704 { 2705 struct buf *bp; 2706 int defrag, metadata; 2707 2708 KASSERT((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC, 2709 ("GB_KVAALLOC only makes sense with GB_UNMAPPED")); 2710 if (!unmapped_buf_allowed) 2711 gbflags &= ~(GB_UNMAPPED | GB_KVAALLOC); 2712 2713 defrag = 0; 2714 if (vp == NULL || (vp->v_vflag & (VV_MD | VV_SYSTEM)) != 0 || 2715 vp->v_type == VCHR) 2716 metadata = 1; 2717 else 2718 metadata = 0; 2719 /* 2720 * We can't afford to block since we might be holding a vnode lock, 2721 * which may prevent system daemons from running. We deal with 2722 * low-memory situations by proactively returning memory and running 2723 * async I/O rather then sync I/O. 2724 */ 2725 atomic_add_int(&getnewbufcalls, 1); 2726 restart: 2727 bp = getnewbuf_scan(maxsize, defrag, (gbflags & (GB_UNMAPPED | 2728 GB_KVAALLOC)) == GB_UNMAPPED, metadata); 2729 if (bp != NULL) 2730 defrag = 0; 2731 2732 /* 2733 * If we exhausted our list, sleep as appropriate. We may have to 2734 * wakeup various daemons and write out some dirty buffers. 2735 * 2736 * Generally we are sleeping due to insufficient buffer space. 2737 */ 2738 if (bp == NULL) { 2739 mtx_assert(&bqclean, MA_OWNED); 2740 getnewbuf_bufd_help(vp, gbflags, slpflag, slptimeo, defrag); 2741 mtx_assert(&bqclean, MA_NOTOWNED); 2742 } else if ((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == GB_UNMAPPED) { 2743 mtx_assert(&bqclean, MA_NOTOWNED); 2744 2745 bufkvafree(bp); 2746 atomic_add_int(&bufreusecnt, 1); 2747 } else { 2748 mtx_assert(&bqclean, MA_NOTOWNED); 2749 2750 /* 2751 * We finally have a valid bp. We aren't quite out of the 2752 * woods, we still have to reserve kva space. In order to 2753 * keep fragmentation sane we only allocate kva in BKVASIZE 2754 * chunks. 2755 */ 2756 maxsize = (maxsize + BKVAMASK) & ~BKVAMASK; 2757 2758 if (maxsize != bp->b_kvasize && 2759 bufkvaalloc(bp, maxsize, gbflags)) { 2760 defrag = 1; 2761 bp->b_flags |= B_INVAL; 2762 brelse(bp); 2763 goto restart; 2764 } else if ((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == 2765 (GB_UNMAPPED | GB_KVAALLOC)) { 2766 bp->b_data = unmapped_buf; 2767 BUF_CHECK_UNMAPPED(bp); 2768 } 2769 atomic_add_int(&bufreusecnt, 1); 2770 } 2771 return (bp); 2772 } 2773 2774 /* 2775 * buf_daemon: 2776 * 2777 * buffer flushing daemon. Buffers are normally flushed by the 2778 * update daemon but if it cannot keep up this process starts to 2779 * take the load in an attempt to prevent getnewbuf() from blocking. 2780 */ 2781 2782 static struct kproc_desc buf_kp = { 2783 "bufdaemon", 2784 buf_daemon, 2785 &bufdaemonproc 2786 }; 2787 SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp); 2788 2789 static int 2790 buf_flush(struct vnode *vp, int target) 2791 { 2792 int flushed; 2793 2794 flushed = flushbufqueues(vp, target, 0); 2795 if (flushed == 0) { 2796 /* 2797 * Could not find any buffers without rollback 2798 * dependencies, so just write the first one 2799 * in the hopes of eventually making progress. 
2800 */ 2801 if (vp != NULL && target > 2) 2802 target /= 2; 2803 flushbufqueues(vp, target, 1); 2804 } 2805 return (flushed); 2806 } 2807 2808 static void 2809 buf_daemon() 2810 { 2811 int lodirty; 2812 2813 /* 2814 * This process needs to be suspended prior to shutdown sync. 2815 */ 2816 EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, bufdaemonproc, 2817 SHUTDOWN_PRI_LAST); 2818 2819 /* 2820 * This process is allowed to take the buffer cache to the limit 2821 */ 2822 curthread->td_pflags |= TDP_NORUNNINGBUF | TDP_BUFNEED; 2823 mtx_lock(&bdlock); 2824 for (;;) { 2825 bd_request = 0; 2826 mtx_unlock(&bdlock); 2827 2828 kproc_suspend_check(bufdaemonproc); 2829 lodirty = lodirtybuffers; 2830 if (bd_speedupreq) { 2831 lodirty = numdirtybuffers / 2; 2832 bd_speedupreq = 0; 2833 } 2834 /* 2835 * Do the flush. Limit the amount of in-transit I/O we 2836 * allow to build up, otherwise we would completely saturate 2837 * the I/O system. 2838 */ 2839 while (numdirtybuffers > lodirty) { 2840 if (buf_flush(NULL, numdirtybuffers - lodirty) == 0) 2841 break; 2842 kern_yield(PRI_USER); 2843 } 2844 2845 /* 2846 * Only clear bd_request if we have reached our low water 2847 * mark. The buf_daemon normally waits 1 second and 2848 * then incrementally flushes any dirty buffers that have 2849 * built up, within reason. 2850 * 2851 * If we were unable to hit our low water mark and couldn't 2852 * find any flushable buffers, we sleep for a short period 2853 * to avoid endless loops on unlockable buffers. 2854 */ 2855 mtx_lock(&bdlock); 2856 if (numdirtybuffers <= lodirtybuffers) { 2857 /* 2858 * We reached our low water mark, reset the 2859 * request and sleep until we are needed again. 2860 * The sleep is just so the suspend code works. 2861 */ 2862 bd_request = 0; 2863 /* 2864 * Do an extra wakeup in case dirty threshold 2865 * changed via sysctl and the explicit transition 2866 * out of shortfall was missed. 2867 */ 2868 bdirtywakeup(); 2869 if (runningbufspace <= lorunningspace) 2870 runningwakeup(); 2871 msleep(&bd_request, &bdlock, PVM, "psleep", hz); 2872 } else { 2873 /* 2874 * We couldn't find any flushable dirty buffers but 2875 * still have too many dirty buffers, we 2876 * have to sleep and try again. (rare) 2877 */ 2878 msleep(&bd_request, &bdlock, PVM, "qsleep", hz / 10); 2879 } 2880 } 2881 } 2882 2883 /* 2884 * flushbufqueues: 2885 * 2886 * Try to flush a buffer in the dirty queue. We must be careful to 2887 * free up B_INVAL buffers instead of write them, which NFS is 2888 * particularly sensitive to. 
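 *
 * The scan below keeps its place in the dirty queue with a "sentinel"
 * buffer (b_qindex == QUEUE_SENTINEL) that is moved past each buffer
 * examined, roughly
 *
 *	bp = TAILQ_NEXT(sentinel, b_freelist);
 *	TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist);
 *	TAILQ_INSERT_AFTER(&bufqueues[queue], bp, sentinel, b_freelist);
 *
 * so that the queue lock can be dropped while bp is being flushed
 * without losing the iteration position.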
2889 */ 2890 static int flushwithdeps = 0; 2891 SYSCTL_INT(_vfs, OID_AUTO, flushwithdeps, CTLFLAG_RW, &flushwithdeps, 2892 0, "Number of buffers flushed with dependecies that require rollbacks"); 2893 2894 static int 2895 flushbufqueues(struct vnode *lvp, int target, int flushdeps) 2896 { 2897 struct buf *sentinel; 2898 struct vnode *vp; 2899 struct mount *mp; 2900 struct buf *bp; 2901 int hasdeps; 2902 int flushed; 2903 int queue; 2904 int error; 2905 bool unlock; 2906 2907 flushed = 0; 2908 queue = QUEUE_DIRTY; 2909 bp = NULL; 2910 sentinel = malloc(sizeof(struct buf), M_TEMP, M_WAITOK | M_ZERO); 2911 sentinel->b_qindex = QUEUE_SENTINEL; 2912 mtx_lock(&bqdirty); 2913 TAILQ_INSERT_HEAD(&bufqueues[queue], sentinel, b_freelist); 2914 mtx_unlock(&bqdirty); 2915 while (flushed != target) { 2916 maybe_yield(); 2917 mtx_lock(&bqdirty); 2918 bp = TAILQ_NEXT(sentinel, b_freelist); 2919 if (bp != NULL) { 2920 TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist); 2921 TAILQ_INSERT_AFTER(&bufqueues[queue], bp, sentinel, 2922 b_freelist); 2923 } else { 2924 mtx_unlock(&bqdirty); 2925 break; 2926 } 2927 /* 2928 * Skip sentinels inserted by other invocations of the 2929 * flushbufqueues(), taking care to not reorder them. 2930 * 2931 * Only flush the buffers that belong to the 2932 * vnode locked by the curthread. 2933 */ 2934 if (bp->b_qindex == QUEUE_SENTINEL || (lvp != NULL && 2935 bp->b_vp != lvp)) { 2936 mtx_unlock(&bqdirty); 2937 continue; 2938 } 2939 error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL); 2940 mtx_unlock(&bqdirty); 2941 if (error != 0) 2942 continue; 2943 if (bp->b_pin_count > 0) { 2944 BUF_UNLOCK(bp); 2945 continue; 2946 } 2947 /* 2948 * BKGRDINPROG can only be set with the buf and bufobj 2949 * locks both held. We tolerate a race to clear it here. 2950 */ 2951 if ((bp->b_vflags & BV_BKGRDINPROG) != 0 || 2952 (bp->b_flags & B_DELWRI) == 0) { 2953 BUF_UNLOCK(bp); 2954 continue; 2955 } 2956 if (bp->b_flags & B_INVAL) { 2957 bremfreef(bp); 2958 brelse(bp); 2959 flushed++; 2960 continue; 2961 } 2962 2963 if (!LIST_EMPTY(&bp->b_dep) && buf_countdeps(bp, 0)) { 2964 if (flushdeps == 0) { 2965 BUF_UNLOCK(bp); 2966 continue; 2967 } 2968 hasdeps = 1; 2969 } else 2970 hasdeps = 0; 2971 /* 2972 * We must hold the lock on a vnode before writing 2973 * one of its buffers. Otherwise we may confuse, or 2974 * in the case of a snapshot vnode, deadlock the 2975 * system. 2976 * 2977 * The lock order here is the reverse of the normal 2978 * of vnode followed by buf lock. This is ok because 2979 * the NOWAIT will prevent deadlock. 2980 */ 2981 vp = bp->b_vp; 2982 if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { 2983 BUF_UNLOCK(bp); 2984 continue; 2985 } 2986 if (lvp == NULL) { 2987 unlock = true; 2988 error = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT); 2989 } else { 2990 ASSERT_VOP_LOCKED(vp, "getbuf"); 2991 unlock = false; 2992 error = VOP_ISLOCKED(vp) == LK_EXCLUSIVE ? 0 : 2993 vn_lock(vp, LK_TRYUPGRADE); 2994 } 2995 if (error == 0) { 2996 CTR3(KTR_BUF, "flushbufqueue(%p) vp %p flags %X", 2997 bp, bp->b_vp, bp->b_flags); 2998 if (curproc == bufdaemonproc) { 2999 vfs_bio_awrite(bp); 3000 } else { 3001 bremfree(bp); 3002 bwrite(bp); 3003 notbufdflushes++; 3004 } 3005 vn_finished_write(mp); 3006 if (unlock) 3007 VOP_UNLOCK(vp, 0); 3008 flushwithdeps += hasdeps; 3009 flushed++; 3010 3011 /* 3012 * Sleeping on runningbufspace while holding 3013 * vnode lock leads to deadlock. 
3014 */ 3015 if (curproc == bufdaemonproc && 3016 runningbufspace > hirunningspace) 3017 waitrunningbufspace(); 3018 continue; 3019 } 3020 vn_finished_write(mp); 3021 BUF_UNLOCK(bp); 3022 } 3023 mtx_lock(&bqdirty); 3024 TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist); 3025 mtx_unlock(&bqdirty); 3026 free(sentinel, M_TEMP); 3027 return (flushed); 3028 } 3029 3030 /* 3031 * Check to see if a block is currently memory resident. 3032 */ 3033 struct buf * 3034 incore(struct bufobj *bo, daddr_t blkno) 3035 { 3036 struct buf *bp; 3037 3038 BO_RLOCK(bo); 3039 bp = gbincore(bo, blkno); 3040 BO_RUNLOCK(bo); 3041 return (bp); 3042 } 3043 3044 /* 3045 * Returns true if no I/O is needed to access the 3046 * associated VM object. This is like incore except 3047 * it also hunts around in the VM system for the data. 3048 */ 3049 3050 static int 3051 inmem(struct vnode * vp, daddr_t blkno) 3052 { 3053 vm_object_t obj; 3054 vm_offset_t toff, tinc, size; 3055 vm_page_t m; 3056 vm_ooffset_t off; 3057 3058 ASSERT_VOP_LOCKED(vp, "inmem"); 3059 3060 if (incore(&vp->v_bufobj, blkno)) 3061 return 1; 3062 if (vp->v_mount == NULL) 3063 return 0; 3064 obj = vp->v_object; 3065 if (obj == NULL) 3066 return (0); 3067 3068 size = PAGE_SIZE; 3069 if (size > vp->v_mount->mnt_stat.f_iosize) 3070 size = vp->v_mount->mnt_stat.f_iosize; 3071 off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize; 3072 3073 VM_OBJECT_RLOCK(obj); 3074 for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) { 3075 m = vm_page_lookup(obj, OFF_TO_IDX(off + toff)); 3076 if (!m) 3077 goto notinmem; 3078 tinc = size; 3079 if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK)) 3080 tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK); 3081 if (vm_page_is_valid(m, 3082 (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0) 3083 goto notinmem; 3084 } 3085 VM_OBJECT_RUNLOCK(obj); 3086 return 1; 3087 3088 notinmem: 3089 VM_OBJECT_RUNLOCK(obj); 3090 return (0); 3091 } 3092 3093 /* 3094 * Set the dirty range for a buffer based on the status of the dirty 3095 * bits in the pages comprising the buffer. The range is limited 3096 * to the size of the buffer. 3097 * 3098 * Tell the VM system that the pages associated with this buffer 3099 * are clean. This is used for delayed writes where the data is 3100 * going to go to disk eventually without additional VM intevention. 3101 * 3102 * Note that while we only really need to clean through to b_bcount, we 3103 * just go ahead and clean through to b_bufsize. 
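 *
 * (Illustrative context: bdwrite() is the expected caller, along the
 * lines of
 *
 *	bdirty(bp);
 *	...
 *	vfs_clean_pages_dirty_buf(bp);
 *	bqrelse(bp);
 *
 * so that the pageout daemon does not launder pages whose dirtiness
 * is already tracked by the B_DELWRI buffer.)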
3104 */ 3105 static void 3106 vfs_clean_pages_dirty_buf(struct buf *bp) 3107 { 3108 vm_ooffset_t foff, noff, eoff; 3109 vm_page_t m; 3110 int i; 3111 3112 if ((bp->b_flags & B_VMIO) == 0 || bp->b_bufsize == 0) 3113 return; 3114 3115 foff = bp->b_offset; 3116 KASSERT(bp->b_offset != NOOFFSET, 3117 ("vfs_clean_pages_dirty_buf: no buffer offset")); 3118 3119 VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); 3120 vfs_drain_busy_pages(bp); 3121 vfs_setdirty_locked_object(bp); 3122 for (i = 0; i < bp->b_npages; i++) { 3123 noff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; 3124 eoff = noff; 3125 if (eoff > bp->b_offset + bp->b_bufsize) 3126 eoff = bp->b_offset + bp->b_bufsize; 3127 m = bp->b_pages[i]; 3128 vfs_page_set_validclean(bp, foff, m); 3129 /* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */ 3130 foff = noff; 3131 } 3132 VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); 3133 } 3134 3135 static void 3136 vfs_setdirty_locked_object(struct buf *bp) 3137 { 3138 vm_object_t object; 3139 int i; 3140 3141 object = bp->b_bufobj->bo_object; 3142 VM_OBJECT_ASSERT_WLOCKED(object); 3143 3144 /* 3145 * We qualify the scan for modified pages on whether the 3146 * object has been flushed yet. 3147 */ 3148 if ((object->flags & OBJ_MIGHTBEDIRTY) != 0) { 3149 vm_offset_t boffset; 3150 vm_offset_t eoffset; 3151 3152 /* 3153 * test the pages to see if they have been modified directly 3154 * by users through the VM system. 3155 */ 3156 for (i = 0; i < bp->b_npages; i++) 3157 vm_page_test_dirty(bp->b_pages[i]); 3158 3159 /* 3160 * Calculate the encompassing dirty range, boffset and eoffset, 3161 * (eoffset - boffset) bytes. 3162 */ 3163 3164 for (i = 0; i < bp->b_npages; i++) { 3165 if (bp->b_pages[i]->dirty) 3166 break; 3167 } 3168 boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); 3169 3170 for (i = bp->b_npages - 1; i >= 0; --i) { 3171 if (bp->b_pages[i]->dirty) { 3172 break; 3173 } 3174 } 3175 eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); 3176 3177 /* 3178 * Fit it to the buffer. 3179 */ 3180 3181 if (eoffset > bp->b_bcount) 3182 eoffset = bp->b_bcount; 3183 3184 /* 3185 * If we have a good dirty range, merge with the existing 3186 * dirty range. 3187 */ 3188 3189 if (boffset < eoffset) { 3190 if (bp->b_dirtyoff > boffset) 3191 bp->b_dirtyoff = boffset; 3192 if (bp->b_dirtyend < eoffset) 3193 bp->b_dirtyend = eoffset; 3194 } 3195 } 3196 } 3197 3198 /* 3199 * Allocate the KVA mapping for an existing buffer. 3200 * If an unmapped buffer is provided but a mapped buffer is requested, take 3201 * also care to properly setup mappings between pages and KVA. 3202 */ 3203 static void 3204 bp_unmapped_get_kva(struct buf *bp, daddr_t blkno, int size, int gbflags) 3205 { 3206 struct buf *scratch_bp; 3207 int bsize, maxsize, need_mapping, need_kva; 3208 off_t offset; 3209 3210 need_mapping = bp->b_data == unmapped_buf && 3211 (gbflags & GB_UNMAPPED) == 0; 3212 need_kva = bp->b_kvabase == unmapped_buf && 3213 bp->b_data == unmapped_buf && 3214 (gbflags & GB_KVAALLOC) != 0; 3215 if (!need_mapping && !need_kva) 3216 return; 3217 3218 BUF_CHECK_UNMAPPED(bp); 3219 3220 if (need_mapping && bp->b_kvabase != unmapped_buf) { 3221 /* 3222 * Buffer is not mapped, but the KVA was already 3223 * reserved at the time of the instantiation. Use the 3224 * allocated space. 3225 */ 3226 goto has_addr; 3227 } 3228 3229 /* 3230 * Calculate the amount of the address space we would reserve 3231 * if the buffer was mapped. 3232 */ 3233 bsize = vn_isdisk(bp->b_vp, NULL) ? 
DEV_BSIZE : bp->b_bufobj->bo_bsize; 3234 KASSERT(bsize != 0, ("bsize == 0, check bo->bo_bsize")); 3235 offset = blkno * bsize; 3236 maxsize = size + (offset & PAGE_MASK); 3237 maxsize = imax(maxsize, bsize); 3238 3239 mapping_loop: 3240 if (bufkvaalloc(bp, maxsize, gbflags)) { 3241 /* 3242 * Request defragmentation. getnewbuf() returns us the 3243 * allocated space by the scratch buffer KVA. 3244 */ 3245 scratch_bp = getnewbuf(bp->b_vp, 0, 0, size, maxsize, gbflags | 3246 (GB_UNMAPPED | GB_KVAALLOC)); 3247 if (scratch_bp == NULL) { 3248 if ((gbflags & GB_NOWAIT_BD) != 0) { 3249 /* 3250 * XXXKIB: defragmentation cannot 3251 * succeed, not sure what else to do. 3252 */ 3253 panic("GB_NOWAIT_BD and GB_UNMAPPED %p", bp); 3254 } 3255 atomic_add_int(&mappingrestarts, 1); 3256 goto mapping_loop; 3257 } 3258 KASSERT(scratch_bp->b_kvabase != unmapped_buf, 3259 ("scratch bp has no KVA %p", scratch_bp)); 3260 /* Grab pointers. */ 3261 bp->b_kvabase = scratch_bp->b_kvabase; 3262 bp->b_kvasize = scratch_bp->b_kvasize; 3263 bp->b_data = scratch_bp->b_data; 3264 3265 /* Get rid of the scratch buffer. */ 3266 scratch_bp->b_kvasize = 0; 3267 scratch_bp->b_flags |= B_INVAL; 3268 scratch_bp->b_data = scratch_bp->b_kvabase = unmapped_buf; 3269 brelse(scratch_bp); 3270 } 3271 has_addr: 3272 if (need_mapping) { 3273 /* b_offset is handled by bpmap_qenter. */ 3274 bp->b_data = bp->b_kvabase; 3275 BUF_CHECK_MAPPED(bp); 3276 bpmap_qenter(bp); 3277 } 3278 } 3279 3280 /* 3281 * getblk: 3282 * 3283 * Get a block given a specified block and offset into a file/device. 3284 * The buffer's B_DONE bit will be cleared on return, making it almost 3285 * ready for an I/O initiation. B_INVAL may or may not be set on 3286 * return. The caller should clear B_INVAL prior to initiating a 3287 * READ. 3288 * 3289 * For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for 3290 * an existing buffer. 3291 * 3292 * For a VMIO buffer, B_CACHE is modified according to the backing VM. 3293 * If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set 3294 * and then cleared based on the backing VM. If the previous buffer is 3295 * non-0-sized but invalid, B_CACHE will be cleared. 3296 * 3297 * If getblk() must create a new buffer, the new buffer is returned with 3298 * both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which 3299 * case it is returned with B_INVAL clear and B_CACHE set based on the 3300 * backing VM. 3301 * 3302 * getblk() also forces a bwrite() for any B_DELWRI buffer whose 3303 * B_CACHE bit is clear. 3304 * 3305 * What this means, basically, is that the caller should use B_CACHE to 3306 * determine whether the buffer is fully valid or not and should clear 3307 * B_INVAL prior to issuing a read. If the caller intends to validate 3308 * the buffer by loading its data area with something, the caller needs 3309 * to clear B_INVAL. If the caller does this without issuing an I/O, 3310 * the caller should set B_CACHE ( as an optimization ), else the caller 3311 * should issue the I/O and biodone() will set B_CACHE if the I/O was 3312 * a write attempt or if it was a successful read. If the caller 3313 * intends to issue a READ, the caller must clear B_INVAL and BIO_ERROR 3314 * prior to issuing the READ. biodone() will *not* clear B_INVAL.
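 *
 * Put differently, a minimal bread()-style consumer looks roughly
 * like this (sketch only, error handling and credentials omitted):
 *
 *	bp = getblk(vp, blkno, size, 0, 0, 0);
 *	if ((bp->b_flags & B_CACHE) == 0) {
 *		bp->b_iocmd = BIO_READ;
 *		bp->b_flags &= ~B_INVAL;
 *		bp->b_ioflags &= ~BIO_ERROR;
 *		vfs_busy_pages(bp, 0);
 *		bstrategy(bp);
 *		error = bufwait(bp);
 *	}
 *	... use bp->b_data ...
 *	brelse(bp);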
3315 */ 3316 struct buf * 3317 getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo, 3318 int flags) 3319 { 3320 struct buf *bp; 3321 struct bufobj *bo; 3322 int bsize, error, maxsize, vmio; 3323 off_t offset; 3324 3325 CTR3(KTR_BUF, "getblk(%p, %ld, %d)", vp, (long)blkno, size); 3326 KASSERT((flags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC, 3327 ("GB_KVAALLOC only makes sense with GB_UNMAPPED")); 3328 ASSERT_VOP_LOCKED(vp, "getblk"); 3329 if (size > MAXBCACHEBUF) 3330 panic("getblk: size(%d) > MAXBCACHEBUF(%d)\n", size, 3331 MAXBCACHEBUF); 3332 if (!unmapped_buf_allowed) 3333 flags &= ~(GB_UNMAPPED | GB_KVAALLOC); 3334 3335 bo = &vp->v_bufobj; 3336 loop: 3337 BO_RLOCK(bo); 3338 bp = gbincore(bo, blkno); 3339 if (bp != NULL) { 3340 int lockflags; 3341 /* 3342 * Buffer is in-core. If the buffer is not busy nor managed, 3343 * it must be on a queue. 3344 */ 3345 lockflags = LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK; 3346 3347 if (flags & GB_LOCK_NOWAIT) 3348 lockflags |= LK_NOWAIT; 3349 3350 error = BUF_TIMELOCK(bp, lockflags, 3351 BO_LOCKPTR(bo), "getblk", slpflag, slptimeo); 3352 3353 /* 3354 * If we slept and got the lock we have to restart in case 3355 * the buffer changed identities. 3356 */ 3357 if (error == ENOLCK) 3358 goto loop; 3359 /* We timed out or were interrupted. */ 3360 else if (error) 3361 return (NULL); 3362 /* If recursed, assume caller knows the rules. */ 3363 else if (BUF_LOCKRECURSED(bp)) 3364 goto end; 3365 3366 /* 3367 * The buffer is locked. B_CACHE is cleared if the buffer is 3368 * invalid. Otherwise, for a non-VMIO buffer, B_CACHE is set 3369 * and for a VMIO buffer B_CACHE is adjusted according to the 3370 * backing VM cache. 3371 */ 3372 if (bp->b_flags & B_INVAL) 3373 bp->b_flags &= ~B_CACHE; 3374 else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0) 3375 bp->b_flags |= B_CACHE; 3376 if (bp->b_flags & B_MANAGED) 3377 MPASS(bp->b_qindex == QUEUE_NONE); 3378 else 3379 bremfree(bp); 3380 3381 /* 3382 * check for size inconsistencies for non-VMIO case. 3383 */ 3384 if (bp->b_bcount != size) { 3385 if ((bp->b_flags & B_VMIO) == 0 || 3386 (size > bp->b_kvasize)) { 3387 if (bp->b_flags & B_DELWRI) { 3388 /* 3389 * If buffer is pinned and caller does 3390 * not want sleep waiting for it to be 3391 * unpinned, bail out 3392 * */ 3393 if (bp->b_pin_count > 0) { 3394 if (flags & GB_LOCK_NOWAIT) { 3395 bqrelse(bp); 3396 return (NULL); 3397 } else { 3398 bunpin_wait(bp); 3399 } 3400 } 3401 bp->b_flags |= B_NOCACHE; 3402 bwrite(bp); 3403 } else { 3404 if (LIST_EMPTY(&bp->b_dep)) { 3405 bp->b_flags |= B_RELBUF; 3406 brelse(bp); 3407 } else { 3408 bp->b_flags |= B_NOCACHE; 3409 bwrite(bp); 3410 } 3411 } 3412 goto loop; 3413 } 3414 } 3415 3416 /* 3417 * Handle the case of unmapped buffer which should 3418 * become mapped, or the buffer for which KVA 3419 * reservation is requested. 3420 */ 3421 bp_unmapped_get_kva(bp, blkno, size, flags); 3422 3423 /* 3424 * If the size is inconsistant in the VMIO case, we can resize 3425 * the buffer. This might lead to B_CACHE getting set or 3426 * cleared. If the size has not changed, B_CACHE remains 3427 * unchanged from its previous state. 
3428 */ 3429 if (bp->b_bcount != size) 3430 allocbuf(bp, size); 3431 3432 KASSERT(bp->b_offset != NOOFFSET, 3433 ("getblk: no buffer offset")); 3434 3435 /* 3436 * A buffer with B_DELWRI set and B_CACHE clear must 3437 * be committed before we can return the buffer in 3438 * order to prevent the caller from issuing a read 3439 * ( due to B_CACHE not being set ) and overwriting 3440 * it. 3441 * 3442 * Most callers, including NFS and FFS, need this to 3443 * operate properly either because they assume they 3444 * can issue a read if B_CACHE is not set, or because 3445 * ( for example ) an uncached B_DELWRI might loop due 3446 * to softupdates re-dirtying the buffer. In the latter 3447 * case, B_CACHE is set after the first write completes, 3448 * preventing further loops. 3449 * NOTE! b*write() sets B_CACHE. If we cleared B_CACHE 3450 * above while extending the buffer, we cannot allow the 3451 * buffer to remain with B_CACHE set after the write 3452 * completes or it will represent a corrupt state. To 3453 * deal with this we set B_NOCACHE to scrap the buffer 3454 * after the write. 3455 * 3456 * We might be able to do something fancy, like setting 3457 * B_CACHE in bwrite() except if B_DELWRI is already set, 3458 * so the below call doesn't set B_CACHE, but that gets real 3459 * confusing. This is much easier. 3460 */ 3461 3462 if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) { 3463 bp->b_flags |= B_NOCACHE; 3464 bwrite(bp); 3465 goto loop; 3466 } 3467 bp->b_flags &= ~B_DONE; 3468 } else { 3469 /* 3470 * Buffer is not in-core, create new buffer. The buffer 3471 * returned by getnewbuf() is locked. Note that the returned 3472 * buffer is also considered valid (not marked B_INVAL). 3473 */ 3474 BO_RUNLOCK(bo); 3475 /* 3476 * If the user does not want us to create the buffer, bail out 3477 * here. 3478 */ 3479 if (flags & GB_NOCREAT) 3480 return NULL; 3481 if (numfreebuffers == 0 && TD_IS_IDLETHREAD(curthread)) 3482 return NULL; 3483 3484 bsize = vn_isdisk(vp, NULL) ? DEV_BSIZE : bo->bo_bsize; 3485 KASSERT(bsize != 0, ("bsize == 0, check bo->bo_bsize")); 3486 offset = blkno * bsize; 3487 vmio = vp->v_object != NULL; 3488 if (vmio) { 3489 maxsize = size + (offset & PAGE_MASK); 3490 } else { 3491 maxsize = size; 3492 /* Do not allow non-VMIO notmapped buffers. */ 3493 flags &= ~(GB_UNMAPPED | GB_KVAALLOC); 3494 } 3495 maxsize = imax(maxsize, bsize); 3496 3497 bp = getnewbuf(vp, slpflag, slptimeo, size, maxsize, flags); 3498 if (bp == NULL) { 3499 if (slpflag || slptimeo) 3500 return NULL; 3501 goto loop; 3502 } 3503 3504 /* 3505 * This code is used to make sure that a buffer is not 3506 * created while the getnewbuf routine is blocked. 3507 * This can be a problem whether the vnode is locked or not. 3508 * If the buffer is created out from under us, we have to 3509 * throw away the one we just created. 3510 * 3511 * Note: this must occur before we associate the buffer 3512 * with the vp especially considering limitations in 3513 * the splay tree implementation when dealing with duplicate 3514 * lblkno's. 3515 */ 3516 BO_LOCK(bo); 3517 if (gbincore(bo, blkno)) { 3518 BO_UNLOCK(bo); 3519 bp->b_flags |= B_INVAL; 3520 brelse(bp); 3521 goto loop; 3522 } 3523 3524 /* 3525 * Insert the buffer into the hash, so that it can 3526 * be found by incore. 3527 */ 3528 bp->b_blkno = bp->b_lblkno = blkno; 3529 bp->b_offset = offset; 3530 bgetvp(vp, bp); 3531 BO_UNLOCK(bo); 3532 3533 /* 3534 * set B_VMIO bit. allocbuf() the buffer bigger. 
Since the 3535 * buffer size starts out as 0, B_CACHE will be set by 3536 * allocbuf() for the VMIO case prior to it testing the 3537 * backing store for validity. 3538 */ 3539 3540 if (vmio) { 3541 bp->b_flags |= B_VMIO; 3542 KASSERT(vp->v_object == bp->b_bufobj->bo_object, 3543 ("ARGH! different b_bufobj->bo_object %p %p %p\n", 3544 bp, vp->v_object, bp->b_bufobj->bo_object)); 3545 } else { 3546 bp->b_flags &= ~B_VMIO; 3547 KASSERT(bp->b_bufobj->bo_object == NULL, 3548 ("ARGH! has b_bufobj->bo_object %p %p\n", 3549 bp, bp->b_bufobj->bo_object)); 3550 BUF_CHECK_MAPPED(bp); 3551 } 3552 3553 allocbuf(bp, size); 3554 bp->b_flags &= ~B_DONE; 3555 } 3556 CTR4(KTR_BUF, "getblk(%p, %ld, %d) = %p", vp, (long)blkno, size, bp); 3557 BUF_ASSERT_HELD(bp); 3558 end: 3559 KASSERT(bp->b_bufobj == bo, 3560 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 3561 return (bp); 3562 } 3563 3564 /* 3565 * Get an empty, disassociated buffer of given size. The buffer is initially 3566 * set to B_INVAL. 3567 */ 3568 struct buf * 3569 geteblk(int size, int flags) 3570 { 3571 struct buf *bp; 3572 int maxsize; 3573 3574 maxsize = (size + BKVAMASK) & ~BKVAMASK; 3575 while ((bp = getnewbuf(NULL, 0, 0, size, maxsize, flags)) == NULL) { 3576 if ((flags & GB_NOWAIT_BD) && 3577 (curthread->td_pflags & TDP_BUFNEED) != 0) 3578 return (NULL); 3579 } 3580 allocbuf(bp, size); 3581 bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */ 3582 BUF_ASSERT_HELD(bp); 3583 return (bp); 3584 } 3585 3586 /* 3587 * Truncate the backing store for a non-vmio buffer. 3588 */ 3589 static void 3590 vfs_nonvmio_truncate(struct buf *bp, int newbsize) 3591 { 3592 3593 if (bp->b_flags & B_MALLOC) { 3594 /* 3595 * malloced buffers are not shrunk 3596 */ 3597 if (newbsize == 0) { 3598 bufmallocadjust(bp, 0); 3599 free(bp->b_data, M_BIOBUF); 3600 bp->b_data = bp->b_kvabase; 3601 bp->b_flags &= ~B_MALLOC; 3602 } 3603 return; 3604 } 3605 vm_hold_free_pages(bp, newbsize); 3606 bufspaceadjust(bp, newbsize); 3607 } 3608 3609 /* 3610 * Extend the backing for a non-VMIO buffer. 3611 */ 3612 static void 3613 vfs_nonvmio_extend(struct buf *bp, int newbsize) 3614 { 3615 caddr_t origbuf; 3616 int origbufsize; 3617 3618 /* 3619 * We only use malloced memory on the first allocation. 3620 * and revert to page-allocated memory when the buffer 3621 * grows. 3622 * 3623 * There is a potential smp race here that could lead 3624 * to bufmallocspace slightly passing the max. It 3625 * is probably extremely rare and not worth worrying 3626 * over. 3627 */ 3628 if (bp->b_bufsize == 0 && newbsize <= PAGE_SIZE/2 && 3629 bufmallocspace < maxbufmallocspace) { 3630 bp->b_data = malloc(newbsize, M_BIOBUF, M_WAITOK); 3631 bp->b_flags |= B_MALLOC; 3632 bufmallocadjust(bp, newbsize); 3633 return; 3634 } 3635 3636 /* 3637 * If the buffer is growing on its other-than-first 3638 * allocation then we revert to the page-allocation 3639 * scheme. 
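 *
 * For instance (hypothetical sizes), a buffer first allocbuf()ed to
 * 512 bytes lives in malloc(9) memory; growing it to 8 KB later takes
 * this path, wires fresh pages behind b_kvabase, bcopy()s the old
 * 512 bytes over and frees the malloc()ed block.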
3640 */ 3641 origbuf = NULL; 3642 origbufsize = 0; 3643 if (bp->b_flags & B_MALLOC) { 3644 origbuf = bp->b_data; 3645 origbufsize = bp->b_bufsize; 3646 bp->b_data = bp->b_kvabase; 3647 bufmallocadjust(bp, 0); 3648 bp->b_flags &= ~B_MALLOC; 3649 newbsize = round_page(newbsize); 3650 } 3651 vm_hold_load_pages(bp, (vm_offset_t) bp->b_data + bp->b_bufsize, 3652 (vm_offset_t) bp->b_data + newbsize); 3653 if (origbuf != NULL) { 3654 bcopy(origbuf, bp->b_data, origbufsize); 3655 free(origbuf, M_BIOBUF); 3656 } 3657 bufspaceadjust(bp, newbsize); 3658 } 3659 3660 /* 3661 * This code constitutes the buffer memory from either anonymous system 3662 * memory (in the case of non-VMIO operations) or from an associated 3663 * VM object (in the case of VMIO operations). This code is able to 3664 * resize a buffer up or down. 3665 * 3666 * Note that this code is tricky, and has many complications to resolve 3667 * deadlock or inconsistant data situations. Tread lightly!!! 3668 * There are B_CACHE and B_DELWRI interactions that must be dealt with by 3669 * the caller. Calling this code willy nilly can result in the loss of data. 3670 * 3671 * allocbuf() only adjusts B_CACHE for VMIO buffers. getblk() deals with 3672 * B_CACHE for the non-VMIO case. 3673 */ 3674 int 3675 allocbuf(struct buf *bp, int size) 3676 { 3677 int newbsize; 3678 3679 BUF_ASSERT_HELD(bp); 3680 3681 if (bp->b_kvasize != 0 && bp->b_kvasize < size) 3682 panic("allocbuf: buffer too small"); 3683 3684 newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); 3685 if ((bp->b_flags & B_VMIO) == 0) { 3686 if ((bp->b_flags & B_MALLOC) == 0) 3687 newbsize = round_page(newbsize); 3688 /* 3689 * Just get anonymous memory from the kernel. Don't 3690 * mess with B_CACHE. 3691 */ 3692 if (newbsize < bp->b_bufsize) 3693 vfs_nonvmio_truncate(bp, newbsize); 3694 else if (newbsize > bp->b_bufsize) 3695 vfs_nonvmio_extend(bp, newbsize); 3696 } else { 3697 int desiredpages; 3698 3699 desiredpages = (size == 0) ? 0 : 3700 num_pages((bp->b_offset & PAGE_MASK) + newbsize); 3701 3702 if (bp->b_flags & B_MALLOC) 3703 panic("allocbuf: VMIO buffer can't be malloced"); 3704 /* 3705 * Set B_CACHE initially if buffer is 0 length or will become 3706 * 0-length. 3707 */ 3708 if (size == 0 || bp->b_bufsize == 0) 3709 bp->b_flags |= B_CACHE; 3710 3711 if (newbsize < bp->b_bufsize) 3712 vfs_vmio_truncate(bp, desiredpages); 3713 /* XXX This looks as if it should be newbsize > b_bufsize */ 3714 else if (size > bp->b_bcount) 3715 vfs_vmio_extend(bp, desiredpages, size); 3716 bufspaceadjust(bp, newbsize); 3717 } 3718 bp->b_bcount = size; /* requested buffer size. 
*/ 3719 return 1; 3720 } 3721 3722 extern int inflight_transient_maps; 3723 3724 void 3725 biodone(struct bio *bp) 3726 { 3727 struct mtx *mtxp; 3728 void (*done)(struct bio *); 3729 vm_offset_t start, end; 3730 3731 if ((bp->bio_flags & BIO_TRANSIENT_MAPPING) != 0) { 3732 bp->bio_flags &= ~BIO_TRANSIENT_MAPPING; 3733 bp->bio_flags |= BIO_UNMAPPED; 3734 start = trunc_page((vm_offset_t)bp->bio_data); 3735 end = round_page((vm_offset_t)bp->bio_data + bp->bio_length); 3736 bp->bio_data = unmapped_buf; 3737 pmap_qremove(start, OFF_TO_IDX(end - start)); 3738 vmem_free(transient_arena, start, end - start); 3739 atomic_add_int(&inflight_transient_maps, -1); 3740 } 3741 done = bp->bio_done; 3742 if (done == NULL) { 3743 mtxp = mtx_pool_find(mtxpool_sleep, bp); 3744 mtx_lock(mtxp); 3745 bp->bio_flags |= BIO_DONE; 3746 wakeup(bp); 3747 mtx_unlock(mtxp); 3748 } else { 3749 bp->bio_flags |= BIO_DONE; 3750 done(bp); 3751 } 3752 } 3753 3754 /* 3755 * Wait for a BIO to finish. 3756 */ 3757 int 3758 biowait(struct bio *bp, const char *wchan) 3759 { 3760 struct mtx *mtxp; 3761 3762 mtxp = mtx_pool_find(mtxpool_sleep, bp); 3763 mtx_lock(mtxp); 3764 while ((bp->bio_flags & BIO_DONE) == 0) 3765 msleep(bp, mtxp, PRIBIO, wchan, 0); 3766 mtx_unlock(mtxp); 3767 if (bp->bio_error != 0) 3768 return (bp->bio_error); 3769 if (!(bp->bio_flags & BIO_ERROR)) 3770 return (0); 3771 return (EIO); 3772 } 3773 3774 void 3775 biofinish(struct bio *bp, struct devstat *stat, int error) 3776 { 3777 3778 if (error) { 3779 bp->bio_error = error; 3780 bp->bio_flags |= BIO_ERROR; 3781 } 3782 if (stat != NULL) 3783 devstat_end_transaction_bio(stat, bp); 3784 biodone(bp); 3785 } 3786 3787 /* 3788 * bufwait: 3789 * 3790 * Wait for buffer I/O completion, returning error status. The buffer 3791 * is left locked and B_DONE on return. B_EINTR is converted into an EINTR 3792 * error and cleared. 3793 */ 3794 int 3795 bufwait(struct buf *bp) 3796 { 3797 if (bp->b_iocmd == BIO_READ) 3798 bwait(bp, PRIBIO, "biord"); 3799 else 3800 bwait(bp, PRIBIO, "biowr"); 3801 if (bp->b_flags & B_EINTR) { 3802 bp->b_flags &= ~B_EINTR; 3803 return (EINTR); 3804 } 3805 if (bp->b_ioflags & BIO_ERROR) { 3806 return (bp->b_error ? bp->b_error : EIO); 3807 } else { 3808 return (0); 3809 } 3810 } 3811 3812 /* 3813 * bufdone: 3814 * 3815 * Finish I/O on a buffer, optionally calling a completion function. 3816 * This is usually called from an interrupt so process blocking is 3817 * not allowed. 3818 * 3819 * biodone is also responsible for setting B_CACHE in a B_VMIO bp. 3820 * In a non-VMIO bp, B_CACHE will be set on the next getblk() 3821 * assuming B_INVAL is clear. 3822 * 3823 * For the VMIO case, we set B_CACHE if the op was a read and no 3824 * read error occured, or if the op was a write. B_CACHE is never 3825 * set if the buffer is invalid or otherwise uncacheable. 3826 * 3827 * biodone does not mess with B_INVAL, allowing the I/O routine or the 3828 * initiator to leave B_INVAL set to brelse the buffer out of existance 3829 * in the biodone routine. 
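 *
 * An illustrative (hypothetical) asynchronous initiator might do
 *
 *	bp->b_iocmd = BIO_WRITE;
 *	bp->b_flags |= B_ASYNC;
 *	bp->b_iodone = my_done;
 *	bstrategy(bp);
 *
 * where my_done() owns the buffer once bufdone() has called it; the
 * usual pattern is for the handler to finish its bookkeeping and call
 * bufdone() again, which then falls through to bufdone_finish() and
 * brelse()/bqrelse() for the async case.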
3830 */ 3831 void 3832 bufdone(struct buf *bp) 3833 { 3834 struct bufobj *dropobj; 3835 void (*biodone)(struct buf *); 3836 3837 CTR3(KTR_BUF, "bufdone(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); 3838 dropobj = NULL; 3839 3840 KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp)); 3841 BUF_ASSERT_HELD(bp); 3842 3843 runningbufwakeup(bp); 3844 if (bp->b_iocmd == BIO_WRITE) 3845 dropobj = bp->b_bufobj; 3846 /* call optional completion function if requested */ 3847 if (bp->b_iodone != NULL) { 3848 biodone = bp->b_iodone; 3849 bp->b_iodone = NULL; 3850 (*biodone) (bp); 3851 if (dropobj) 3852 bufobj_wdrop(dropobj); 3853 return; 3854 } 3855 3856 bufdone_finish(bp); 3857 3858 if (dropobj) 3859 bufobj_wdrop(dropobj); 3860 } 3861 3862 void 3863 bufdone_finish(struct buf *bp) 3864 { 3865 BUF_ASSERT_HELD(bp); 3866 3867 if (!LIST_EMPTY(&bp->b_dep)) 3868 buf_complete(bp); 3869 3870 if (bp->b_flags & B_VMIO) { 3871 /* 3872 * Set B_CACHE if the op was a normal read and no error 3873 * occured. B_CACHE is set for writes in the b*write() 3874 * routines. 3875 */ 3876 if (bp->b_iocmd == BIO_READ && 3877 !(bp->b_flags & (B_INVAL|B_NOCACHE)) && 3878 !(bp->b_ioflags & BIO_ERROR)) 3879 bp->b_flags |= B_CACHE; 3880 vfs_vmio_iodone(bp); 3881 } 3882 3883 /* 3884 * For asynchronous completions, release the buffer now. The brelse 3885 * will do a wakeup there if necessary - so no need to do a wakeup 3886 * here in the async case. The sync case always needs to do a wakeup. 3887 */ 3888 if (bp->b_flags & B_ASYNC) { 3889 if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_RELBUF)) || 3890 (bp->b_ioflags & BIO_ERROR)) 3891 brelse(bp); 3892 else 3893 bqrelse(bp); 3894 } else 3895 bdone(bp); 3896 } 3897 3898 /* 3899 * This routine is called in lieu of iodone in the case of 3900 * incomplete I/O. This keeps the busy status for pages 3901 * consistant. 3902 */ 3903 void 3904 vfs_unbusy_pages(struct buf *bp) 3905 { 3906 int i; 3907 vm_object_t obj; 3908 vm_page_t m; 3909 3910 runningbufwakeup(bp); 3911 if (!(bp->b_flags & B_VMIO)) 3912 return; 3913 3914 obj = bp->b_bufobj->bo_object; 3915 VM_OBJECT_WLOCK(obj); 3916 for (i = 0; i < bp->b_npages; i++) { 3917 m = bp->b_pages[i]; 3918 if (m == bogus_page) { 3919 m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i); 3920 if (!m) 3921 panic("vfs_unbusy_pages: page missing\n"); 3922 bp->b_pages[i] = m; 3923 if (buf_mapped(bp)) { 3924 BUF_CHECK_MAPPED(bp); 3925 pmap_qenter(trunc_page((vm_offset_t)bp->b_data), 3926 bp->b_pages, bp->b_npages); 3927 } else 3928 BUF_CHECK_UNMAPPED(bp); 3929 } 3930 vm_object_pip_subtract(obj, 1); 3931 vm_page_sunbusy(m); 3932 } 3933 vm_object_pip_wakeupn(obj, 0); 3934 VM_OBJECT_WUNLOCK(obj); 3935 } 3936 3937 /* 3938 * vfs_page_set_valid: 3939 * 3940 * Set the valid bits in a page based on the supplied offset. The 3941 * range is restricted to the buffer's size. 3942 * 3943 * This routine is typically called after a read completes. 3944 */ 3945 static void 3946 vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m) 3947 { 3948 vm_ooffset_t eoff; 3949 3950 /* 3951 * Compute the end offset, eoff, such that [off, eoff) does not span a 3952 * page boundary and eoff is not greater than the end of the buffer. 3953 * The end of the buffer, in this case, is our file EOF, not the 3954 * allocation size of the buffer. 3955 */ 3956 eoff = (off + PAGE_SIZE) & ~(vm_ooffset_t)PAGE_MASK; 3957 if (eoff > bp->b_offset + bp->b_bcount) 3958 eoff = bp->b_offset + bp->b_bcount; 3959 3960 /* 3961 * Set valid range. 
3960 /* 3961 * Set valid range. This is typically the entire buffer and thus the 3962 * entire page. 3963 */ 3964 if (eoff > off) 3965 vm_page_set_valid_range(m, off & PAGE_MASK, eoff - off); 3966 } 3967 3968 /* 3969 * vfs_page_set_validclean: 3970 * 3971 * Set the valid bits and clear the dirty bits in a page based on the 3972 * supplied offset. The range is restricted to the buffer's size. 3973 */ 3974 static void 3975 vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off, vm_page_t m) 3976 { 3977 vm_ooffset_t soff, eoff; 3978 3979 /* 3980 * Start and end offsets in buffer. eoff - soff may not cross a 3981 * page boundary or cross the end of the buffer. The end of the 3982 * buffer, in this case, is our file EOF, not the allocation size 3983 * of the buffer. 3984 */ 3985 soff = off; 3986 eoff = (off + PAGE_SIZE) & ~(off_t)PAGE_MASK; 3987 if (eoff > bp->b_offset + bp->b_bcount) 3988 eoff = bp->b_offset + bp->b_bcount; 3989 3990 /* 3991 * Set valid range. This is typically the entire buffer and thus the 3992 * entire page. 3993 */ 3994 if (eoff > soff) { 3995 vm_page_set_validclean( 3996 m, 3997 (vm_offset_t) (soff & PAGE_MASK), 3998 (vm_offset_t) (eoff - soff) 3999 ); 4000 } 4001 } 4002 4003 /* 4004 * Ensure that no buffer page is exclusive busied. If any page is 4005 * exclusive busied, drain it. 4006 */ 4007 void 4008 vfs_drain_busy_pages(struct buf *bp) 4009 { 4010 vm_page_t m; 4011 int i, last_busied; 4012 4013 VM_OBJECT_ASSERT_WLOCKED(bp->b_bufobj->bo_object); 4014 last_busied = 0; 4015 for (i = 0; i < bp->b_npages; i++) { 4016 m = bp->b_pages[i]; 4017 if (vm_page_xbusied(m)) { 4018 for (; last_busied < i; last_busied++) 4019 vm_page_sbusy(bp->b_pages[last_busied]); 4020 while (vm_page_xbusied(m)) { 4021 vm_page_lock(m); 4022 VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); 4023 vm_page_busy_sleep(m, "vbpage"); 4024 VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); 4025 } 4026 } 4027 } 4028 for (i = 0; i < last_busied; i++) 4029 vm_page_sunbusy(bp->b_pages[i]); 4030 } 4031 4032 /* 4033 * This routine is called before a device strategy routine. 4034 * It is used to tell the VM system that paging I/O is in 4035 * progress, and to treat the pages associated with the buffer 4036 * almost as being exclusive busied. Also the object's paging_in_progress 4037 * counter is maintained to make sure that the object doesn't become 4038 * inconsistent. 4039 * 4040 * Since I/O has not been initiated yet, certain buffer flags 4041 * such as BIO_ERROR or B_INVAL may be in an inconsistent state 4042 * and should be ignored. 4043 */ 4044 void 4045 vfs_busy_pages(struct buf *bp, int clear_modify) 4046 { 4047 int i, bogus; 4048 vm_object_t obj; 4049 vm_ooffset_t foff; 4050 vm_page_t m; 4051 4052 if (!(bp->b_flags & B_VMIO)) 4053 return; 4054 4055 obj = bp->b_bufobj->bo_object; 4056 foff = bp->b_offset; 4057 KASSERT(bp->b_offset != NOOFFSET, 4058 ("vfs_busy_pages: no buffer offset")); 4059 VM_OBJECT_WLOCK(obj); 4060 vfs_drain_busy_pages(bp); 4061 if (bp->b_bufsize != 0) 4062 vfs_setdirty_locked_object(bp); 4063 bogus = 0; 4064 for (i = 0; i < bp->b_npages; i++) { 4065 m = bp->b_pages[i]; 4066 4067 if ((bp->b_flags & B_CLUSTER) == 0) { 4068 vm_object_pip_add(obj, 1); 4069 vm_page_sbusy(m); 4070 } 4071 /* 4072 * When readying a buffer for a read (i.e., 4073 * clear_modify == 0), it is important to do 4074 * bogus_page replacement for valid pages in 4075 * partially instantiated buffers. Partially 4076 * instantiated buffers can, in turn, occur when 4077 * reconstituting a buffer from its VM backing store 4078 * base.
We only have to do this if B_CACHE is 4079 * clear ( which causes the I/O to occur in the 4080 * first place ). The replacement prevents the read 4081 * I/O from overwriting potentially dirty VM-backed 4082 * pages. XXX bogus page replacement is, uh, bogus. 4083 * It may not work properly with small-block devices. 4084 * We need to find a better way. 4085 */ 4086 if (clear_modify) { 4087 pmap_remove_write(m); 4088 vfs_page_set_validclean(bp, foff, m); 4089 } else if (m->valid == VM_PAGE_BITS_ALL && 4090 (bp->b_flags & B_CACHE) == 0) { 4091 bp->b_pages[i] = bogus_page; 4092 bogus++; 4093 } 4094 foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; 4095 } 4096 VM_OBJECT_WUNLOCK(obj); 4097 if (bogus && buf_mapped(bp)) { 4098 BUF_CHECK_MAPPED(bp); 4099 pmap_qenter(trunc_page((vm_offset_t)bp->b_data), 4100 bp->b_pages, bp->b_npages); 4101 } 4102 } 4103 4104 /* 4105 * vfs_bio_set_valid: 4106 * 4107 * Set the range within the buffer to valid. The range is 4108 * relative to the beginning of the buffer, b_offset. Note that 4109 * b_offset itself may be offset from the beginning of the first 4110 * page. 4111 */ 4112 void 4113 vfs_bio_set_valid(struct buf *bp, int base, int size) 4114 { 4115 int i, n; 4116 vm_page_t m; 4117 4118 if (!(bp->b_flags & B_VMIO)) 4119 return; 4120 4121 /* 4122 * Fixup base to be relative to beginning of first page. 4123 * Set initial n to be the maximum number of bytes in the 4124 * first page that can be validated. 4125 */ 4126 base += (bp->b_offset & PAGE_MASK); 4127 n = PAGE_SIZE - (base & PAGE_MASK); 4128 4129 VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); 4130 for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) { 4131 m = bp->b_pages[i]; 4132 if (n > size) 4133 n = size; 4134 vm_page_set_valid_range(m, base & PAGE_MASK, n); 4135 base += n; 4136 size -= n; 4137 n = PAGE_SIZE; 4138 } 4139 VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); 4140 } 4141 4142 /* 4143 * vfs_bio_clrbuf: 4144 * 4145 * If the specified buffer is a non-VMIO buffer, clear the entire 4146 * buffer. If the specified buffer is a VMIO buffer, clear and 4147 * validate only the previously invalid portions of the buffer. 4148 * This routine essentially fakes an I/O, so we need to clear 4149 * BIO_ERROR and B_INVAL. 4150 * 4151 * Note that while we only theoretically need to clear through b_bcount, 4152 * we go ahead and clear through b_bufsize. 
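 *
 * Worked example (editor's note, not from the original source): with
 * DEV_BSIZE 512 and a page-aligned 2048-byte buffer, the single-page
 * case below computes mask = (1 << (2048 / 512)) - 1 = 0xf, i.e. the
 * valid bits of the first four 512-byte blocks of the page are the
 * only ones examined, zeroed and set.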
4153 */ 4154 void 4155 vfs_bio_clrbuf(struct buf *bp) 4156 { 4157 int i, j, mask, sa, ea, slide; 4158 4159 if ((bp->b_flags & (B_VMIO | B_MALLOC)) != B_VMIO) { 4160 clrbuf(bp); 4161 return; 4162 } 4163 bp->b_flags &= ~B_INVAL; 4164 bp->b_ioflags &= ~BIO_ERROR; 4165 VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); 4166 if ((bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) && 4167 (bp->b_offset & PAGE_MASK) == 0) { 4168 if (bp->b_pages[0] == bogus_page) 4169 goto unlock; 4170 mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1; 4171 VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[0]->object); 4172 if ((bp->b_pages[0]->valid & mask) == mask) 4173 goto unlock; 4174 if ((bp->b_pages[0]->valid & mask) == 0) { 4175 pmap_zero_page_area(bp->b_pages[0], 0, bp->b_bufsize); 4176 bp->b_pages[0]->valid |= mask; 4177 goto unlock; 4178 } 4179 } 4180 sa = bp->b_offset & PAGE_MASK; 4181 slide = 0; 4182 for (i = 0; i < bp->b_npages; i++, sa = 0) { 4183 slide = imin(slide + PAGE_SIZE, bp->b_offset + bp->b_bufsize); 4184 ea = slide & PAGE_MASK; 4185 if (ea == 0) 4186 ea = PAGE_SIZE; 4187 if (bp->b_pages[i] == bogus_page) 4188 continue; 4189 j = sa / DEV_BSIZE; 4190 mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j; 4191 VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[i]->object); 4192 if ((bp->b_pages[i]->valid & mask) == mask) 4193 continue; 4194 if ((bp->b_pages[i]->valid & mask) == 0) 4195 pmap_zero_page_area(bp->b_pages[i], sa, ea - sa); 4196 else { 4197 for (; sa < ea; sa += DEV_BSIZE, j++) { 4198 if ((bp->b_pages[i]->valid & (1 << j)) == 0) { 4199 pmap_zero_page_area(bp->b_pages[i], 4200 sa, DEV_BSIZE); 4201 } 4202 } 4203 } 4204 bp->b_pages[i]->valid |= mask; 4205 } 4206 unlock: 4207 VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); 4208 bp->b_resid = 0; 4209 } 4210 4211 void 4212 vfs_bio_bzero_buf(struct buf *bp, int base, int size) 4213 { 4214 vm_page_t m; 4215 int i, n; 4216 4217 if (buf_mapped(bp)) { 4218 BUF_CHECK_MAPPED(bp); 4219 bzero(bp->b_data + base, size); 4220 } else { 4221 BUF_CHECK_UNMAPPED(bp); 4222 n = PAGE_SIZE - (base & PAGE_MASK); 4223 for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) { 4224 m = bp->b_pages[i]; 4225 if (n > size) 4226 n = size; 4227 pmap_zero_page_area(m, base & PAGE_MASK, n); 4228 base += n; 4229 size -= n; 4230 n = PAGE_SIZE; 4231 } 4232 } 4233 } 4234 4235 /* 4236 * vm_hold_load_pages and vm_hold_free_pages get pages into 4237 * a buffer's address space. The pages are anonymous and are 4238 * not associated with a file object. 4239 */ 4240 static void 4241 vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to) 4242 { 4243 vm_offset_t pg; 4244 vm_page_t p; 4245 int index; 4246 4247 BUF_CHECK_MAPPED(bp); 4248 4249 to = round_page(to); 4250 from = round_page(from); 4251 index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; 4252 4253 for (pg = from; pg < to; pg += PAGE_SIZE, index++) { 4254 tryagain: 4255 /* 4256 * note: must allocate system pages since blocking here 4257 * could interfere with paging I/O, no matter which 4258 * process we are.
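 *
 * (Editor's note) If no page is available, sleep in VM_WAIT until the
 * page daemon has reclaimed memory, then retry the allocation.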
4259 */ 4260 p = vm_page_alloc(NULL, 0, VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ | 4261 VM_ALLOC_WIRED | VM_ALLOC_COUNT((to - pg) >> PAGE_SHIFT)); 4262 if (p == NULL) { 4263 VM_WAIT; 4264 goto tryagain; 4265 } 4266 pmap_qenter(pg, &p, 1); 4267 bp->b_pages[index] = p; 4268 } 4269 bp->b_npages = index; 4270 } 4271 4272 /* Return pages associated with this buf to the vm system */ 4273 static void 4274 vm_hold_free_pages(struct buf *bp, int newbsize) 4275 { 4276 vm_offset_t from; 4277 vm_page_t p; 4278 int index, newnpages; 4279 4280 BUF_CHECK_MAPPED(bp); 4281 4282 from = round_page((vm_offset_t)bp->b_data + newbsize); 4283 newnpages = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; 4284 if (bp->b_npages > newnpages) 4285 pmap_qremove(from, bp->b_npages - newnpages); 4286 for (index = newnpages; index < bp->b_npages; index++) { 4287 p = bp->b_pages[index]; 4288 bp->b_pages[index] = NULL; 4289 if (vm_page_sbusied(p)) 4290 printf("vm_hold_free_pages: blkno: %jd, lblkno: %jd\n", 4291 (intmax_t)bp->b_blkno, (intmax_t)bp->b_lblkno); 4292 p->wire_count--; 4293 vm_page_free(p); 4294 atomic_subtract_int(&vm_cnt.v_wire_count, 1); 4295 } 4296 bp->b_npages = newnpages; 4297 } 4298 4299 /* 4300 * Map an IO request into kernel virtual address space. 4301 * 4302 * All requests are (re)mapped into kernel VA space. 4303 * Notice that we use b_bufsize for the size of the buffer 4304 * to be mapped. b_bcount might be modified by the driver. 4305 * 4306 * Note that even if the caller determines that the address space should 4307 * be valid, a race or a smaller-file mapped into a larger space may 4308 * actually cause vmapbuf() to fail, so all callers of vmapbuf() MUST 4309 * check the return value. 4310 * 4311 * This function only works with pager buffers. 4312 */ 4313 int 4314 vmapbuf(struct buf *bp, int mapbuf) 4315 { 4316 vm_prot_t prot; 4317 int pidx; 4318 4319 if (bp->b_bufsize < 0) 4320 return (-1); 4321 prot = VM_PROT_READ; 4322 if (bp->b_iocmd == BIO_READ) 4323 prot |= VM_PROT_WRITE; /* Less backwards than it looks */ 4324 if ((pidx = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map, 4325 (vm_offset_t)bp->b_data, bp->b_bufsize, prot, bp->b_pages, 4326 btoc(MAXPHYS))) < 0) 4327 return (-1); 4328 bp->b_npages = pidx; 4329 bp->b_offset = ((vm_offset_t)bp->b_data) & PAGE_MASK; 4330 if (mapbuf || !unmapped_buf_allowed) { 4331 pmap_qenter((vm_offset_t)bp->b_kvabase, bp->b_pages, pidx); 4332 bp->b_data = bp->b_kvabase + bp->b_offset; 4333 } else 4334 bp->b_data = unmapped_buf; 4335 return(0); 4336 } 4337 4338 /* 4339 * Free the io map PTEs associated with this IO operation. 4340 * We also invalidate the TLB entries and restore the original b_addr. 4341 * 4342 * This function only works with pager buffers. 
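 *
 * Editor's sketch of the usual pairing (hypothetical caller, not taken
 * from this file):
 *
 *	if (vmapbuf(bp, 1) < 0)
 *		return (EFAULT);
 *	... perform the transfer ...
 *	vunmapbuf(bp);
 *
 * As noted above, the return value of vmapbuf() must always be checked.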
4343 */ 4344 void 4345 vunmapbuf(struct buf *bp) 4346 { 4347 int npages; 4348 4349 npages = bp->b_npages; 4350 if (buf_mapped(bp)) 4351 pmap_qremove(trunc_page((vm_offset_t)bp->b_data), npages); 4352 vm_page_unhold_pages(bp->b_pages, npages); 4353 4354 bp->b_data = unmapped_buf; 4355 } 4356 4357 void 4358 bdone(struct buf *bp) 4359 { 4360 struct mtx *mtxp; 4361 4362 mtxp = mtx_pool_find(mtxpool_sleep, bp); 4363 mtx_lock(mtxp); 4364 bp->b_flags |= B_DONE; 4365 wakeup(bp); 4366 mtx_unlock(mtxp); 4367 } 4368 4369 void 4370 bwait(struct buf *bp, u_char pri, const char *wchan) 4371 { 4372 struct mtx *mtxp; 4373 4374 mtxp = mtx_pool_find(mtxpool_sleep, bp); 4375 mtx_lock(mtxp); 4376 while ((bp->b_flags & B_DONE) == 0) 4377 msleep(bp, mtxp, pri, wchan, 0); 4378 mtx_unlock(mtxp); 4379 } 4380 4381 int 4382 bufsync(struct bufobj *bo, int waitfor) 4383 { 4384 4385 return (VOP_FSYNC(bo->__bo_vnode, waitfor, curthread)); 4386 } 4387 4388 void 4389 bufstrategy(struct bufobj *bo, struct buf *bp) 4390 { 4391 int i = 0; 4392 struct vnode *vp; 4393 4394 vp = bp->b_vp; 4395 KASSERT(vp == bo->bo_private, ("Inconsistent vnode bufstrategy")); 4396 KASSERT(vp->v_type != VCHR && vp->v_type != VBLK, 4397 ("Wrong vnode in bufstrategy(bp=%p, vp=%p)", bp, vp)); 4398 i = VOP_STRATEGY(vp, bp); 4399 KASSERT(i == 0, ("VOP_STRATEGY failed bp=%p vp=%p", bp, bp->b_vp)); 4400 } 4401 4402 void 4403 bufobj_wrefl(struct bufobj *bo) 4404 { 4405 4406 KASSERT(bo != NULL, ("NULL bo in bufobj_wref")); 4407 ASSERT_BO_WLOCKED(bo); 4408 bo->bo_numoutput++; 4409 } 4410 4411 void 4412 bufobj_wref(struct bufobj *bo) 4413 { 4414 4415 KASSERT(bo != NULL, ("NULL bo in bufobj_wref")); 4416 BO_LOCK(bo); 4417 bo->bo_numoutput++; 4418 BO_UNLOCK(bo); 4419 } 4420 4421 void 4422 bufobj_wdrop(struct bufobj *bo) 4423 { 4424 4425 KASSERT(bo != NULL, ("NULL bo in bufobj_wdrop")); 4426 BO_LOCK(bo); 4427 KASSERT(bo->bo_numoutput > 0, ("bufobj_wdrop non-positive count")); 4428 if ((--bo->bo_numoutput == 0) && (bo->bo_flag & BO_WWAIT)) { 4429 bo->bo_flag &= ~BO_WWAIT; 4430 wakeup(&bo->bo_numoutput); 4431 } 4432 BO_UNLOCK(bo); 4433 } 4434 4435 int 4436 bufobj_wwait(struct bufobj *bo, int slpflag, int timeo) 4437 { 4438 int error; 4439 4440 KASSERT(bo != NULL, ("NULL bo in bufobj_wwait")); 4441 ASSERT_BO_WLOCKED(bo); 4442 error = 0; 4443 while (bo->bo_numoutput) { 4444 bo->bo_flag |= BO_WWAIT; 4445 error = msleep(&bo->bo_numoutput, BO_LOCKPTR(bo), 4446 slpflag | (PRIBIO + 1), "bo_wwait", timeo); 4447 if (error) 4448 break; 4449 } 4450 return (error); 4451 } 4452 4453 void 4454 bpin(struct buf *bp) 4455 { 4456 struct mtx *mtxp; 4457 4458 mtxp = mtx_pool_find(mtxpool_sleep, bp); 4459 mtx_lock(mtxp); 4460 bp->b_pin_count++; 4461 mtx_unlock(mtxp); 4462 } 4463 4464 void 4465 bunpin(struct buf *bp) 4466 { 4467 struct mtx *mtxp; 4468 4469 mtxp = mtx_pool_find(mtxpool_sleep, bp); 4470 mtx_lock(mtxp); 4471 if (--bp->b_pin_count == 0) 4472 wakeup(bp); 4473 mtx_unlock(mtxp); 4474 } 4475 4476 void 4477 bunpin_wait(struct buf *bp) 4478 { 4479 struct mtx *mtxp; 4480 4481 mtxp = mtx_pool_find(mtxpool_sleep, bp); 4482 mtx_lock(mtxp); 4483 while (bp->b_pin_count > 0) 4484 msleep(bp, mtxp, PRIBIO, "bwunpin", 0); 4485 mtx_unlock(mtxp); 4486 } 4487 4488 /* 4489 * Set bio_data or bio_ma for struct bio from the struct buf. 
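 *
 * For an unmapped buffer the pages are handed over through bio_ma and
 * bio_ma_n, with BIO_UNMAPPED set and bio_data pointing at unmapped_buf;
 * for a mapped buffer bio_data simply aliases b_data.  A typical caller
 * (editor's sketch; it is assumed here that bio_length has been filled
 * in from b_bcount first, as the KASSERT below relies on it):
 *
 *	bip->bio_length = bp->b_bcount;
 *	bdata2bio(bp, bip);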
4490 */ 4491 void 4492 bdata2bio(struct buf *bp, struct bio *bip) 4493 { 4494 4495 if (!buf_mapped(bp)) { 4496 KASSERT(unmapped_buf_allowed, ("unmapped")); 4497 bip->bio_ma = bp->b_pages; 4498 bip->bio_ma_n = bp->b_npages; 4499 bip->bio_data = unmapped_buf; 4500 bip->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK; 4501 bip->bio_flags |= BIO_UNMAPPED; 4502 KASSERT(round_page(bip->bio_ma_offset + bip->bio_length) / 4503 PAGE_SIZE == bp->b_npages, 4504 ("Buffer %p too short: %d %lld %d", bp, bip->bio_ma_offset, 4505 (long long)bip->bio_length, bip->bio_ma_n)); 4506 } else { 4507 bip->bio_data = bp->b_data; 4508 bip->bio_ma = NULL; 4509 } 4510 } 4511 4512 #include "opt_ddb.h" 4513 #ifdef DDB 4514 #include <ddb/ddb.h> 4515 4516 /* DDB command to show buffer data */ 4517 DB_SHOW_COMMAND(buffer, db_show_buffer) 4518 { 4519 /* get args */ 4520 struct buf *bp = (struct buf *)addr; 4521 4522 if (!have_addr) { 4523 db_printf("usage: show buffer <addr>\n"); 4524 return; 4525 } 4526 4527 db_printf("buf at %p\n", bp); 4528 db_printf("b_flags = 0x%b, b_xflags=0x%b, b_vflags=0x%b\n", 4529 (u_int)bp->b_flags, PRINT_BUF_FLAGS, (u_int)bp->b_xflags, 4530 PRINT_BUF_XFLAGS, (u_int)bp->b_vflags, PRINT_BUF_VFLAGS); 4531 db_printf( 4532 "b_error = %d, b_bufsize = %ld, b_bcount = %ld, b_resid = %ld\n" 4533 "b_bufobj = (%p), b_data = %p, b_blkno = %jd, b_lblkno = %jd, " 4534 "b_dep = %p\n", 4535 bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid, 4536 bp->b_bufobj, bp->b_data, (intmax_t)bp->b_blkno, 4537 (intmax_t)bp->b_lblkno, bp->b_dep.lh_first); 4538 db_printf("b_kvabase = %p, b_kvasize = %d\n", 4539 bp->b_kvabase, bp->b_kvasize); 4540 if (bp->b_npages) { 4541 int i; 4542 db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages); 4543 for (i = 0; i < bp->b_npages; i++) { 4544 vm_page_t m; 4545 m = bp->b_pages[i]; 4546 db_printf("(%p, 0x%lx, 0x%lx)", (void *)m->object, 4547 (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m)); 4548 if ((i + 1) < bp->b_npages) 4549 db_printf(","); 4550 } 4551 db_printf("\n"); 4552 } 4553 db_printf(" "); 4554 BUF_LOCKPRINTINFO(bp); 4555 } 4556 4557 DB_SHOW_COMMAND(lockedbufs, lockedbufs) 4558 { 4559 struct buf *bp; 4560 int i; 4561 4562 for (i = 0; i < nbuf; i++) { 4563 bp = &buf[i]; 4564 if (BUF_ISLOCKED(bp)) { 4565 db_show_buffer((uintptr_t)bp, 1, 0, NULL); 4566 db_printf("\n"); 4567 } 4568 } 4569 } 4570 4571 DB_SHOW_COMMAND(vnodebufs, db_show_vnodebufs) 4572 { 4573 struct vnode *vp; 4574 struct buf *bp; 4575 4576 if (!have_addr) { 4577 db_printf("usage: show vnodebufs <addr>\n"); 4578 return; 4579 } 4580 vp = (struct vnode *)addr; 4581 db_printf("Clean buffers:\n"); 4582 TAILQ_FOREACH(bp, &vp->v_bufobj.bo_clean.bv_hd, b_bobufs) { 4583 db_show_buffer((uintptr_t)bp, 1, 0, NULL); 4584 db_printf("\n"); 4585 } 4586 db_printf("Dirty buffers:\n"); 4587 TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs) { 4588 db_show_buffer((uintptr_t)bp, 1, 0, NULL); 4589 db_printf("\n"); 4590 } 4591 } 4592 4593 DB_COMMAND(countfreebufs, db_coundfreebufs) 4594 { 4595 struct buf *bp; 4596 int i, used = 0, nfree = 0; 4597 4598 if (have_addr) { 4599 db_printf("usage: countfreebufs\n"); 4600 return; 4601 } 4602 4603 for (i = 0; i < nbuf; i++) { 4604 bp = &buf[i]; 4605 if ((bp->b_flags & B_INFREECNT) != 0) 4606 nfree++; 4607 else 4608 used++; 4609 } 4610 4611 db_printf("Counted %d free, %d used (%d tot)\n", nfree, used, 4612 nfree + used); 4613 db_printf("numfreebuffers is %d\n", numfreebuffers); 4614 } 4615 #endif /* DDB */ 4616
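
/*
 * Editor's note (not part of the original source): the DDB commands
 * defined above are run from the in-kernel debugger prompt, e.g.:
 *
 *	db> show buffer 0xfffff80012345678
 *	db> show lockedbufs
 *	db> show vnodebufs <vnode address>
 *	db> countfreebufs
 *
 * The buffer address shown is a made-up example.
 */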