1 /* 2 * Copyright (c) 1994,1997 John S. Dyson 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice immediately at the beginning of the file, without modification, 10 * this list of conditions, and the following disclaimer. 11 * 2. Absolutely no warranty of function or purpose is made by the author 12 * John S. Dyson. 13 * 14 * $Id: vfs_bio.c,v 1.225 1999/08/08 18:42:48 phk Exp $ 15 */ 16 17 /* 18 * this file contains a new buffer I/O scheme implementing a coherent 19 * VM object and buffer cache scheme. Pains have been taken to make 20 * sure that the performance degradation associated with schemes such 21 * as this is not realized. 22 * 23 * Author: John S. Dyson 24 * Significant help during the development and debugging phases 25 * had been provided by David Greenman, also of the FreeBSD core team. 26 * 27 * see man buf(9) for more info. 28 */ 29 30 #define VMIO 31 #include <sys/param.h> 32 #include <sys/systm.h> 33 #include <sys/sysproto.h> 34 #include <sys/kernel.h> 35 #include <sys/sysctl.h> 36 #include <sys/proc.h> 37 #include <sys/kthread.h> 38 #include <sys/vnode.h> 39 #include <sys/vmmeter.h> 40 #include <sys/lock.h> 41 #include <vm/vm.h> 42 #include <vm/vm_param.h> 43 #include <vm/vm_prot.h> 44 #include <vm/vm_kern.h> 45 #include <vm/vm_pageout.h> 46 #include <vm/vm_page.h> 47 #include <vm/vm_object.h> 48 #include <vm/vm_extern.h> 49 #include <vm/vm_map.h> 50 #include <sys/buf.h> 51 #include <sys/mount.h> 52 #include <sys/malloc.h> 53 #include <sys/resourcevar.h> 54 #include <sys/conf.h> 55 56 static MALLOC_DEFINE(M_BIOBUF, "BIO buffer", "BIO buffer"); 57 58 struct bio_ops bioops; /* I/O operation notification */ 59 60 struct buf *buf; /* buffer header pool */ 61 struct swqueue bswlist; 62 63 static void vm_hold_free_pages(struct buf * bp, vm_offset_t from, 64 vm_offset_t to); 65 static void vm_hold_load_pages(struct buf * bp, vm_offset_t from, 66 vm_offset_t to); 67 static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, 68 int pageno, vm_page_t m); 69 static void vfs_clean_pages(struct buf * bp); 70 static void vfs_setdirty(struct buf *bp); 71 static void vfs_vmio_release(struct buf *bp); 72 static int flushbufqueues(void); 73 74 static int bd_request; 75 76 static void buf_daemon __P((void)); 77 /* 78 * bogus page -- for I/O to/from partially complete buffers 79 * this is a temporary solution to the problem, but it is not 80 * really that bad. it would be better to split the buffer 81 * for input in the case of buffers partially already in memory, 82 * but the code is intricate enough already. 
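 *
 * Roughly, the substitution works like this, a simplified sketch of what
 * vfs_busy_pages() and brelse() do with the bogus page rather than the
 * literal code (see both routines later in this file):
 *
 *    (before a read: hide pages whose contents are already fully valid)
 *    if (m->valid == VM_PAGE_BITS_ALL)
 *        bp->b_pages[i] = bogus_page;
 *    pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
 *        bp->b_pages, bp->b_npages);
 *
 *    (in brelse(): look the real pages back up and reinstall them)
 *    m = vm_page_lookup(vp->v_object, OFF_TO_IDX(bp->b_offset) + i);
 *    bp->b_pages[i] = m;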
83 */ 84 vm_page_t bogus_page; 85 int runningbufspace; 86 int vmiodirenable = FALSE; 87 static vm_offset_t bogus_offset; 88 89 static int bufspace, maxbufspace, vmiospace, 90 bufmallocspace, maxbufmallocspace, hibufspace; 91 #if 0 92 static int maxvmiobufspace; 93 #endif 94 static int maxbdrun; 95 static int needsbuffer; 96 static int numdirtybuffers, lodirtybuffers, hidirtybuffers; 97 static int numfreebuffers, lofreebuffers, hifreebuffers; 98 static int getnewbufcalls; 99 static int getnewbufrestarts; 100 static int kvafreespace; 101 102 SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD, 103 &numdirtybuffers, 0, ""); 104 SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW, 105 &lodirtybuffers, 0, ""); 106 SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW, 107 &hidirtybuffers, 0, ""); 108 SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, 109 &numfreebuffers, 0, ""); 110 SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW, 111 &lofreebuffers, 0, ""); 112 SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, 113 &hifreebuffers, 0, ""); 114 SYSCTL_INT(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, 115 &runningbufspace, 0, ""); 116 SYSCTL_INT(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RW, 117 &maxbufspace, 0, ""); 118 SYSCTL_INT(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD, 119 &hibufspace, 0, ""); 120 SYSCTL_INT(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, 121 &bufspace, 0, ""); 122 SYSCTL_INT(_vfs, OID_AUTO, maxbdrun, CTLFLAG_RW, 123 &maxbdrun, 0, ""); 124 #if 0 125 SYSCTL_INT(_vfs, OID_AUTO, maxvmiobufspace, CTLFLAG_RW, 126 &maxvmiobufspace, 0, ""); 127 #endif 128 SYSCTL_INT(_vfs, OID_AUTO, vmiospace, CTLFLAG_RD, 129 &vmiospace, 0, ""); 130 SYSCTL_INT(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, 131 &maxbufmallocspace, 0, ""); 132 SYSCTL_INT(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, 133 &bufmallocspace, 0, ""); 134 SYSCTL_INT(_vfs, OID_AUTO, kvafreespace, CTLFLAG_RD, 135 &kvafreespace, 0, ""); 136 SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, 137 &getnewbufcalls, 0, ""); 138 SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW, 139 &getnewbufrestarts, 0, ""); 140 SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW, 141 &vmiodirenable, 0, ""); 142 143 144 static int bufhashmask; 145 static LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash; 146 struct bqueues bufqueues[BUFFER_QUEUES] = { { 0 } }; 147 char *buf_wmesg = BUF_WMESG; 148 149 extern int vm_swap_size; 150 151 #define BUF_MAXUSE 24 152 153 #define VFS_BIO_NEED_ANY 0x01 /* any freeable buffer */ 154 #define VFS_BIO_NEED_DIRTYFLUSH 0x02 /* waiting for dirty buffer flush */ 155 #define VFS_BIO_NEED_FREE 0x04 /* wait for free bufs, hi hysteresis */ 156 #define VFS_BIO_NEED_BUFSPACE 0x08 /* wait for buf space, lo hysteresis */ 157 #define VFS_BIO_NEED_KVASPACE 0x10 /* wait for buffer_map space, emerg */ 158 159 /* 160 * Buffer hash table code. Note that the logical block scans linearly, which 161 * gives us some L1 cache locality. 162 */ 163 164 static __inline 165 struct bufhashhdr * 166 bufhash(struct vnode *vnp, daddr_t bn) 167 { 168 return(&bufhashtbl[(((uintptr_t)(vnp) >> 7) + (int)bn) & bufhashmask]); 169 } 170 171 /* 172 * kvaspacewakeup: 173 * 174 * Called when kva space is potential available for recovery or when 175 * kva space is recovered in the buffer_map. This function wakes up 176 * anyone waiting for buffer_map kva space. Even though the buffer_map 177 * is larger then maxbufspace, this situation will typically occur 178 * when the buffer_map gets fragmented. 
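 *
 * The waiting side of this handshake lives in getnewbuf() later in this
 * file; a sketch of the pattern it uses (slightly condensed):
 *
 *    needsbuffer |= VFS_BIO_NEED_KVASPACE;
 *    while (needsbuffer & VFS_BIO_NEED_KVASPACE) {
 *        if (tsleep(&needsbuffer, (PRIBIO + 4) | slpflag,
 *            "nbufkv", slptimeo))
 *            break;
 *    }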
179 */ 180 181 static __inline void 182 kvaspacewakeup(void) 183 { 184 /* 185 * If someone is waiting for KVA space, wake them up. Even 186 * though we haven't freed the kva space yet, the waiting 187 * process will be able to now. 188 */ 189 if (needsbuffer & VFS_BIO_NEED_KVASPACE) { 190 needsbuffer &= ~VFS_BIO_NEED_KVASPACE; 191 wakeup(&needsbuffer); 192 } 193 } 194 195 /* 196 * numdirtywakeup: 197 * 198 * If someone is blocked due to there being too many dirty buffers, 199 * and numdirtybuffers is now reasonable, wake them up. 200 */ 201 202 static __inline void 203 numdirtywakeup(void) 204 { 205 if (numdirtybuffers < hidirtybuffers) { 206 if (needsbuffer & VFS_BIO_NEED_DIRTYFLUSH) { 207 needsbuffer &= ~VFS_BIO_NEED_DIRTYFLUSH; 208 wakeup(&needsbuffer); 209 } 210 } 211 } 212 213 /* 214 * bufspacewakeup: 215 * 216 * Called when buffer space is potentially available for recovery or when 217 * buffer space is recovered. getnewbuf() will block on this flag when 218 * it is unable to free sufficient buffer space. Buffer space becomes 219 * recoverable when bp's get placed back in the queues. 220 */ 221 222 static __inline void 223 bufspacewakeup(void) 224 { 225 /* 226 * If someone is waiting for BUF space, wake them up. Even 227 * though we haven't freed the kva space yet, the waiting 228 * process will be able to now. 229 */ 230 if (needsbuffer & VFS_BIO_NEED_BUFSPACE) { 231 needsbuffer &= ~VFS_BIO_NEED_BUFSPACE; 232 wakeup(&needsbuffer); 233 } 234 } 235 236 /* 237 * bufcountwakeup: 238 * 239 * Called when a buffer has been added to one of the free queues to 240 * account for the buffer and to wakeup anyone waiting for free buffers. 241 * This typically occurs when large amounts of metadata are being handled 242 * by the buffer cache ( else buffer space runs out first, usually ). 243 */ 244 245 static __inline void 246 bufcountwakeup(void) 247 { 248 ++numfreebuffers; 249 if (needsbuffer) { 250 needsbuffer &= ~VFS_BIO_NEED_ANY; 251 if (numfreebuffers >= hifreebuffers) 252 needsbuffer &= ~VFS_BIO_NEED_FREE; 253 wakeup(&needsbuffer); 254 } 255 } 256 257 /* 258 * vfs_buf_test_cache: 259 * 260 * Called when a buffer is extended. This function clears the B_CACHE 261 * bit if the newly extended portion of the buffer does not contain 262 * valid data. 263 */ 264 static __inline__ 265 void 266 vfs_buf_test_cache(struct buf *bp, 267 vm_ooffset_t foff, vm_offset_t off, vm_offset_t size, 268 vm_page_t m) 269 { 270 if (bp->b_flags & B_CACHE) { 271 int base = (foff + off) & PAGE_MASK; 272 if (vm_page_is_valid(m, base, size) == 0) 273 bp->b_flags &= ~B_CACHE; 274 } 275 } 276 277 static __inline__ 278 void 279 bd_wakeup(int dirtybuflevel) 280 { 281 if (numdirtybuffers >= dirtybuflevel && bd_request == 0) { 282 bd_request = 1; 283 wakeup(&bd_request); 284 } 285 } 286 287 288 /* 289 * Initialize buffer headers and related structures. 
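 *
 * bufhashinit() carves its hash table out of kernel VA that the caller has
 * already set aside and hands back the advanced allocation pointer; the
 * machine-dependent startup code is expected to use it roughly like this
 * (illustrative only):
 *
 *    v = bufhashinit(v);
 *    ...
 *    bufinit();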
290 */ 291 292 caddr_t 293 bufhashinit(caddr_t vaddr) 294 { 295 /* first, make a null hash table */ 296 for (bufhashmask = 8; bufhashmask < nbuf / 4; bufhashmask <<= 1) 297 ; 298 bufhashtbl = (void *)vaddr; 299 vaddr = vaddr + sizeof(*bufhashtbl) * bufhashmask; 300 --bufhashmask; 301 return(vaddr); 302 } 303 304 void 305 bufinit(void) 306 { 307 struct buf *bp; 308 int i; 309 310 TAILQ_INIT(&bswlist); 311 LIST_INIT(&invalhash); 312 simple_lock_init(&buftimelock); 313 314 for (i = 0; i <= bufhashmask; i++) 315 LIST_INIT(&bufhashtbl[i]); 316 317 /* next, make a null set of free lists */ 318 for (i = 0; i < BUFFER_QUEUES; i++) 319 TAILQ_INIT(&bufqueues[i]); 320 321 /* finally, initialize each buffer header and stick on empty q */ 322 for (i = 0; i < nbuf; i++) { 323 bp = &buf[i]; 324 bzero(bp, sizeof *bp); 325 bp->b_flags = B_INVAL; /* we're just an empty header */ 326 bp->b_dev = NODEV; 327 bp->b_rcred = NOCRED; 328 bp->b_wcred = NOCRED; 329 bp->b_qindex = QUEUE_EMPTY; 330 bp->b_xflags = 0; 331 LIST_INIT(&bp->b_dep); 332 BUF_LOCKINIT(bp); 333 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist); 334 LIST_INSERT_HEAD(&invalhash, bp, b_hash); 335 } 336 337 /* 338 * maxbufspace is currently calculated to support all filesystem 339 * blocks to be 8K. If you happen to use a 16K filesystem, the size 340 * of the buffer cache is still the same as it would be for 8K 341 * filesystems. This keeps the size of the buffer cache "in check" 342 * for big block filesystems. 343 * 344 * maxbufspace is calculated as around 50% of the KVA available in 345 * the buffer_map ( DFLTSIZE vs BKVASIZE ), I presume to reduce the 346 * effect of fragmentation. 347 */ 348 maxbufspace = (nbuf + 8) * DFLTBSIZE; 349 if ((hibufspace = maxbufspace - MAXBSIZE * 5) <= MAXBSIZE) 350 hibufspace = 3 * maxbufspace / 4; 351 #if 0 352 /* 353 * reserve 1/3 of the buffers for metadata (VDIR) which might not be VMIO'ed 354 */ 355 maxvmiobufspace = 2 * hibufspace / 3; 356 #endif 357 /* 358 * Limit the amount of malloc memory since it is wired permanently into 359 * the kernel space. Even though this is accounted for in the buffer 360 * allocation, we don't want the malloced region to grow uncontrolled. 361 * The malloc scheme improves memory utilization significantly on average 362 * (small) directories. 363 */ 364 maxbufmallocspace = hibufspace / 20; 365 366 /* 367 * Reduce the chance of a deadlock occuring by limiting the number 368 * of delayed-write dirty buffers we allow to stack up. 369 */ 370 lodirtybuffers = nbuf / 7 + 10; 371 hidirtybuffers = nbuf / 4 + 20; 372 numdirtybuffers = 0; 373 374 /* 375 * Try to keep the number of free buffers in the specified range, 376 * and give the syncer access to an emergency reserve. 377 */ 378 lofreebuffers = nbuf / 18 + 5; 379 hifreebuffers = 2 * lofreebuffers; 380 numfreebuffers = nbuf; 381 382 /* 383 * Maximum number of async ops initiated per buf_daemon loop. This is 384 * somewhat of a hack at the moment, we really need to limit ourselves 385 * based on the number of bytes of I/O in-transit that were initiated 386 * from buf_daemon. 387 */ 388 if ((maxbdrun = nswbuf / 4) < 4) 389 maxbdrun = 4; 390 391 kvafreespace = 0; 392 393 bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE); 394 bogus_page = vm_page_alloc(kernel_object, 395 ((bogus_offset - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT), 396 VM_ALLOC_NORMAL); 397 398 } 399 400 /* 401 * Free the kva allocation for a buffer 402 * Must be called only at splbio or higher, 403 * as this is the only locking for buffer_map. 
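 *
 * That is, callers bracket it like this (getnewbuf() below is already
 * running at splbio() when it calls us):
 *
 *    s = splbio();
 *    bfreekva(bp);
 *    splx(s);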
404 */ 405 static void 406 bfreekva(struct buf * bp) 407 { 408 if (bp->b_kvasize) { 409 vm_map_delete(buffer_map, 410 (vm_offset_t) bp->b_kvabase, 411 (vm_offset_t) bp->b_kvabase + bp->b_kvasize 412 ); 413 bp->b_kvasize = 0; 414 kvaspacewakeup(); 415 } 416 } 417 418 /* 419 * bremfree: 420 * 421 * Remove the buffer from the appropriate free list. 422 */ 423 void 424 bremfree(struct buf * bp) 425 { 426 int s = splbio(); 427 int old_qindex = bp->b_qindex; 428 429 if (bp->b_qindex != QUEUE_NONE) { 430 if (bp->b_qindex == QUEUE_EMPTYKVA) { 431 kvafreespace -= bp->b_kvasize; 432 } 433 KASSERT(BUF_REFCNT(bp) == 1, ("bremfree: bp %p not locked",bp)); 434 TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist); 435 bp->b_qindex = QUEUE_NONE; 436 runningbufspace += bp->b_bufsize; 437 } else { 438 #if !defined(MAX_PERF) 439 if (BUF_REFCNT(bp) <= 1) 440 panic("bremfree: removing a buffer not on a queue"); 441 #endif 442 } 443 444 /* 445 * Fixup numfreebuffers count. If the buffer is invalid or not 446 * delayed-write, and it was on the EMPTY, LRU, or AGE queues, 447 * the buffer was free and we must decrement numfreebuffers. 448 */ 449 if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) { 450 switch(old_qindex) { 451 case QUEUE_DIRTY: 452 case QUEUE_CLEAN: 453 case QUEUE_EMPTY: 454 case QUEUE_EMPTYKVA: 455 --numfreebuffers; 456 break; 457 default: 458 break; 459 } 460 } 461 splx(s); 462 } 463 464 465 /* 466 * Get a buffer with the specified data. Look in the cache first. We 467 * must clear B_ERROR and B_INVAL prior to initiating I/O. If B_CACHE 468 * is set, the buffer is valid and we do not have to do anything ( see 469 * getblk() ). 470 */ 471 int 472 bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred, 473 struct buf ** bpp) 474 { 475 struct buf *bp; 476 477 bp = getblk(vp, blkno, size, 0, 0); 478 *bpp = bp; 479 480 /* if not found in cache, do some I/O */ 481 if ((bp->b_flags & B_CACHE) == 0) { 482 if (curproc != NULL) 483 curproc->p_stats->p_ru.ru_inblock++; 484 KASSERT(!(bp->b_flags & B_ASYNC), ("bread: illegal async bp %p", bp)); 485 bp->b_flags |= B_READ; 486 bp->b_flags &= ~(B_ERROR | B_INVAL); 487 if (bp->b_rcred == NOCRED) { 488 if (cred != NOCRED) 489 crhold(cred); 490 bp->b_rcred = cred; 491 } 492 vfs_busy_pages(bp, 0); 493 VOP_STRATEGY(vp, bp); 494 return (biowait(bp)); 495 } 496 return (0); 497 } 498 499 /* 500 * Operates like bread, but also starts asynchronous I/O on 501 * read-ahead blocks. We must clear B_ERROR and B_INVAL prior 502 * to initiating I/O . If B_CACHE is set, the buffer is valid 503 * and we do not have to do anything. 
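 *
 * A typical caller pattern for bread()/breadn(), sketched with the error
 * handling shortened (real filesystem read paths are more involved):
 *
 *    error = bread(vp, lbn, bsize, NOCRED, &bp);
 *    if (error) {
 *        brelse(bp);
 *        return (error);
 *    }
 *    ... copy out of bp->b_data ...
 *    bqrelse(bp);        (or brelse() if the data should be tossed)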
504 */ 505 int 506 breadn(struct vnode * vp, daddr_t blkno, int size, 507 daddr_t * rablkno, int *rabsize, 508 int cnt, struct ucred * cred, struct buf ** bpp) 509 { 510 struct buf *bp, *rabp; 511 int i; 512 int rv = 0, readwait = 0; 513 514 *bpp = bp = getblk(vp, blkno, size, 0, 0); 515 516 /* if not found in cache, do some I/O */ 517 if ((bp->b_flags & B_CACHE) == 0) { 518 if (curproc != NULL) 519 curproc->p_stats->p_ru.ru_inblock++; 520 bp->b_flags |= B_READ; 521 bp->b_flags &= ~(B_ERROR | B_INVAL); 522 if (bp->b_rcred == NOCRED) { 523 if (cred != NOCRED) 524 crhold(cred); 525 bp->b_rcred = cred; 526 } 527 vfs_busy_pages(bp, 0); 528 VOP_STRATEGY(vp, bp); 529 ++readwait; 530 } 531 532 for (i = 0; i < cnt; i++, rablkno++, rabsize++) { 533 if (inmem(vp, *rablkno)) 534 continue; 535 rabp = getblk(vp, *rablkno, *rabsize, 0, 0); 536 537 if ((rabp->b_flags & B_CACHE) == 0) { 538 if (curproc != NULL) 539 curproc->p_stats->p_ru.ru_inblock++; 540 rabp->b_flags |= B_READ | B_ASYNC; 541 rabp->b_flags &= ~(B_ERROR | B_INVAL); 542 if (rabp->b_rcred == NOCRED) { 543 if (cred != NOCRED) 544 crhold(cred); 545 rabp->b_rcred = cred; 546 } 547 vfs_busy_pages(rabp, 0); 548 BUF_KERNPROC(rabp); 549 VOP_STRATEGY(vp, rabp); 550 } else { 551 brelse(rabp); 552 } 553 } 554 555 if (readwait) { 556 rv = biowait(bp); 557 } 558 return (rv); 559 } 560 561 /* 562 * Write, release buffer on completion. (Done by iodone 563 * if async). Do not bother writing anything if the buffer 564 * is invalid. 565 * 566 * Note that we set B_CACHE here, indicating that buffer is 567 * fully valid and thus cacheable. This is true even of NFS 568 * now so we set it generally. This could be set either here 569 * or in biodone() since the I/O is synchronous. We put it 570 * here. 571 */ 572 int 573 bwrite(struct buf * bp) 574 { 575 int oldflags, s; 576 struct vnode *vp; 577 struct mount *mp; 578 579 if (bp->b_flags & B_INVAL) { 580 brelse(bp); 581 return (0); 582 } 583 584 oldflags = bp->b_flags; 585 586 #if !defined(MAX_PERF) 587 if (BUF_REFCNT(bp) == 0) 588 panic("bwrite: buffer is not busy???"); 589 #endif 590 s = splbio(); 591 bundirty(bp); 592 593 bp->b_flags &= ~(B_READ | B_DONE | B_ERROR); 594 bp->b_flags |= B_WRITEINPROG | B_CACHE; 595 596 bp->b_vp->v_numoutput++; 597 vfs_busy_pages(bp, 1); 598 if (curproc != NULL) 599 curproc->p_stats->p_ru.ru_oublock++; 600 splx(s); 601 if (oldflags & B_ASYNC) 602 BUF_KERNPROC(bp); 603 VOP_STRATEGY(bp->b_vp, bp); 604 605 /* 606 * Collect statistics on synchronous and asynchronous writes. 607 * Writes to block devices are charged to their associated 608 * filesystem (if any). 609 */ 610 if ((vp = bp->b_vp) != NULL) { 611 if (vp->v_type == VBLK) 612 mp = vp->v_specmountpoint; 613 else 614 mp = vp->v_mount; 615 if (mp != NULL) { 616 if ((oldflags & B_ASYNC) == 0) 617 mp->mnt_stat.f_syncwrites++; 618 else 619 mp->mnt_stat.f_asyncwrites++; 620 } 621 } 622 623 if ((oldflags & B_ASYNC) == 0) { 624 int rtval = biowait(bp); 625 brelse(bp); 626 return (rtval); 627 } 628 629 return (0); 630 } 631 632 /* 633 * Delayed write. (Buffer is marked dirty). Do not bother writing 634 * anything if the buffer is marked invalid. 635 * 636 * Note that since the buffer must be completely valid, we can safely 637 * set B_CACHE. In fact, we have to set B_CACHE here rather then in 638 * biodone() in order to prevent getblk from writing the buffer 639 * out synchronously. 
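 *
 * The usual read-modify-write sequence that ends in a delayed write looks
 * roughly like this (names illustrative):
 *
 *    bread(vp, lbn, bsize, NOCRED, &bp);
 *    ... modify bp->b_data ...
 *    bdwrite(bp);        (marks B_DELWRI and returns; buf_daemon or
 *                         the syncer writes the block out later)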
640 */ 641 void 642 bdwrite(struct buf * bp) 643 { 644 #if 0 645 struct vnode *vp; 646 #endif 647 648 #if !defined(MAX_PERF) 649 if (BUF_REFCNT(bp) == 0) 650 panic("bdwrite: buffer is not busy"); 651 #endif 652 653 if (bp->b_flags & B_INVAL) { 654 brelse(bp); 655 return; 656 } 657 bdirty(bp); 658 659 /* 660 * Set B_CACHE, indicating that the buffer is fully valid. This is 661 * true even of NFS now. 662 */ 663 bp->b_flags |= B_CACHE; 664 665 /* 666 * This bmap keeps the system from needing to do the bmap later, 667 * perhaps when the system is attempting to do a sync. Since it 668 * is likely that the indirect block -- or whatever other datastructure 669 * that the filesystem needs is still in memory now, it is a good 670 * thing to do this. Note also, that if the pageout daemon is 671 * requesting a sync -- there might not be enough memory to do 672 * the bmap then... So, this is important to do. 673 */ 674 if (bp->b_lblkno == bp->b_blkno) { 675 VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL); 676 } 677 678 /* 679 * Set the *dirty* buffer range based upon the VM system dirty pages. 680 */ 681 vfs_setdirty(bp); 682 683 /* 684 * We need to do this here to satisfy the vnode_pager and the 685 * pageout daemon, so that it thinks that the pages have been 686 * "cleaned". Note that since the pages are in a delayed write 687 * buffer -- the VFS layer "will" see that the pages get written 688 * out on the next sync, or perhaps the cluster will be completed. 689 */ 690 vfs_clean_pages(bp); 691 bqrelse(bp); 692 693 /* 694 * Wakeup the buffer flushing daemon if we have saturated the 695 * buffer cache. 696 */ 697 698 bd_wakeup(hidirtybuffers); 699 700 /* 701 * note: we cannot initiate I/O from a bdwrite even if we wanted to, 702 * due to the softdep code. 703 */ 704 #if 0 705 /* 706 * XXX The soft dependency code is not prepared to 707 * have I/O done when a bdwrite is requested. For 708 * now we just let the write be delayed if it is 709 * requested by the soft dependency code. 710 */ 711 if ((vp = bp->b_vp) && 712 ((vp->v_type == VBLK && vp->v_specmountpoint && 713 (vp->v_specmountpoint->mnt_flag & MNT_SOFTDEP)) || 714 (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SOFTDEP)))) 715 return; 716 #endif 717 } 718 719 /* 720 * bdirty: 721 * 722 * Turn buffer into delayed write request. We must clear B_READ and 723 * B_RELBUF, and we must set B_DELWRI. We reassign the buffer to 724 * itself to properly update it in the dirty/clean lists. We mark it 725 * B_DONE to ensure that any asynchronization of the buffer properly 726 * clears B_DONE ( else a panic will occur later ). 727 * 728 * bdirty() is kinda like bdwrite() - we have to clear B_INVAL which 729 * might have been set pre-getblk(). Unlike bwrite/bdwrite, bdirty() 730 * should only be called if the buffer is known-good. 731 * 732 * Since the buffer is not on a queue, we do not update the numfreebuffers 733 * count. 734 * 735 * Must be called at splbio(). 736 * The buffer must be on QUEUE_NONE. 737 */ 738 void 739 bdirty(bp) 740 struct buf *bp; 741 { 742 KASSERT(bp->b_qindex == QUEUE_NONE, ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex)); 743 bp->b_flags &= ~(B_READ|B_RELBUF); 744 745 if ((bp->b_flags & B_DELWRI) == 0) { 746 bp->b_flags |= B_DONE | B_DELWRI; 747 reassignbuf(bp, bp->b_vp); 748 ++numdirtybuffers; 749 bd_wakeup(hidirtybuffers); 750 } 751 } 752 753 /* 754 * bundirty: 755 * 756 * Clear B_DELWRI for buffer. 757 * 758 * Since the buffer is not on a queue, we do not update the numfreebuffers 759 * count. 
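 *
 * bwrite() above is the canonical caller: it clears the dirty state under
 * splbio() just before handing the buffer to the driver, e.g.:
 *
 *    s = splbio();
 *    bundirty(bp);
 *    bp->b_flags &= ~(B_READ | B_DONE | B_ERROR);
 *    bp->b_flags |= B_WRITEINPROG | B_CACHE;
 *    ...
 *    splx(s);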
760 * 761 * Must be called at splbio(). 762 * The buffer must be on QUEUE_NONE. 763 */ 764 765 void 766 bundirty(bp) 767 struct buf *bp; 768 { 769 KASSERT(bp->b_qindex == QUEUE_NONE, ("bundirty: buffer %p still on queue %d", bp, bp->b_qindex)); 770 771 if (bp->b_flags & B_DELWRI) { 772 bp->b_flags &= ~B_DELWRI; 773 reassignbuf(bp, bp->b_vp); 774 --numdirtybuffers; 775 numdirtywakeup(); 776 } 777 } 778 779 /* 780 * bawrite: 781 * 782 * Asynchronous write. Start output on a buffer, but do not wait for 783 * it to complete. The buffer is released when the output completes. 784 * 785 * bwrite() ( or the VOP routine anyway ) is responsible for handling 786 * B_INVAL buffers. Not us. 787 */ 788 void 789 bawrite(struct buf * bp) 790 { 791 bp->b_flags |= B_ASYNC; 792 (void) VOP_BWRITE(bp->b_vp, bp); 793 } 794 795 /* 796 * bowrite: 797 * 798 * Ordered write. Start output on a buffer, and flag it so that the 799 * device will write it in the order it was queued. The buffer is 800 * released when the output completes. bwrite() ( or the VOP routine 801 * anyway ) is responsible for handling B_INVAL buffers. 802 */ 803 int 804 bowrite(struct buf * bp) 805 { 806 bp->b_flags |= B_ORDERED | B_ASYNC; 807 return (VOP_BWRITE(bp->b_vp, bp)); 808 } 809 810 /* 811 * bwillwrite: 812 * 813 * Called prior to the locking of any vnodes when we are expecting to 814 * write. We do not want to starve the buffer cache with too many 815 * dirty buffers so we block here. By blocking prior to the locking 816 * of any vnodes we attempt to avoid the situation where a locked vnode 817 * prevents the various system daemons from flushing related buffers. 818 */ 819 820 void 821 bwillwrite(void) 822 { 823 int twenty = (hidirtybuffers - lodirtybuffers) / 5; 824 825 if (numdirtybuffers > hidirtybuffers + twenty) { 826 int s; 827 828 s = splbio(); 829 while (numdirtybuffers > hidirtybuffers) { 830 bd_wakeup(hidirtybuffers); 831 needsbuffer |= VFS_BIO_NEED_DIRTYFLUSH; 832 tsleep(&needsbuffer, (PRIBIO + 4), "flswai", 0); 833 } 834 splx(s); 835 } 836 } 837 838 /* 839 * brelse: 840 * 841 * Release a busy buffer and, if requested, free its resources. The 842 * buffer will be stashed in the appropriate bufqueue[] allowing it 843 * to be accessed later as a cache entity or reused for other purposes. 844 */ 845 void 846 brelse(struct buf * bp) 847 { 848 int s; 849 850 KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); 851 852 #if 0 853 if (bp->b_flags & B_CLUSTER) { 854 relpbuf(bp, NULL); 855 return; 856 } 857 #endif 858 859 s = splbio(); 860 861 if (bp->b_flags & B_LOCKED) 862 bp->b_flags &= ~B_ERROR; 863 864 if ((bp->b_flags & (B_READ | B_ERROR)) == B_ERROR) { 865 /* 866 * Failed write, redirty. Must clear B_ERROR to prevent 867 * pages from being scrapped. Note: B_INVAL is ignored 868 * here but will presumably be dealt with later. 869 */ 870 bp->b_flags &= ~B_ERROR; 871 bdirty(bp); 872 } else if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_FREEBUF)) || 873 (bp->b_bufsize <= 0)) { 874 /* 875 * Either a failed I/O or we were asked to free or not 876 * cache the buffer. 
877 */ 878 bp->b_flags |= B_INVAL; 879 if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_deallocate) 880 (*bioops.io_deallocate)(bp); 881 if (bp->b_flags & B_DELWRI) { 882 --numdirtybuffers; 883 numdirtywakeup(); 884 } 885 bp->b_flags &= ~(B_DELWRI | B_CACHE | B_FREEBUF); 886 if ((bp->b_flags & B_VMIO) == 0) { 887 if (bp->b_bufsize) 888 allocbuf(bp, 0); 889 if (bp->b_vp) 890 brelvp(bp); 891 } 892 } 893 894 /* 895 * We must clear B_RELBUF if B_DELWRI is set. If vfs_vmio_release() 896 * is called with B_DELWRI set, the underlying pages may wind up 897 * getting freed causing a previous write (bdwrite()) to get 'lost' 898 * because pages associated with a B_DELWRI bp are marked clean. 899 * 900 * We still allow the B_INVAL case to call vfs_vmio_release(), even 901 * if B_DELWRI is set. 902 */ 903 904 if (bp->b_flags & B_DELWRI) 905 bp->b_flags &= ~B_RELBUF; 906 907 /* 908 * VMIO buffer rundown. It is not very necessary to keep a VMIO buffer 909 * constituted, not even NFS buffers now. Two flags effect this. If 910 * B_INVAL, the struct buf is invalidated but the VM object is kept 911 * around ( i.e. so it is trivial to reconstitute the buffer later ). 912 * 913 * If B_ERROR or B_NOCACHE is set, pages in the VM object will be 914 * invalidated. B_ERROR cannot be set for a failed write unless the 915 * buffer is also B_INVAL because it hits the re-dirtying code above. 916 * 917 * Normally we can do this whether a buffer is B_DELWRI or not. If 918 * the buffer is an NFS buffer, it is tracking piecemeal writes or 919 * the commit state and we cannot afford to lose the buffer. 920 */ 921 if ((bp->b_flags & B_VMIO) 922 && !(bp->b_vp->v_tag == VT_NFS && 923 bp->b_vp->v_type != VBLK && 924 (bp->b_flags & B_DELWRI)) 925 ) { 926 927 int i, j, resid; 928 vm_page_t m; 929 off_t foff; 930 vm_pindex_t poff; 931 vm_object_t obj; 932 struct vnode *vp; 933 934 vp = bp->b_vp; 935 936 /* 937 * Get the base offset and length of the buffer. Note that 938 * for block sizes that are less then PAGE_SIZE, the b_data 939 * base of the buffer does not represent exactly b_offset and 940 * neither b_offset nor b_size are necessarily page aligned. 941 * Instead, the starting position of b_offset is: 942 * 943 * b_data + (b_offset & PAGE_MASK) 944 * 945 * block sizes less then DEV_BSIZE (usually 512) are not 946 * supported due to the page granularity bits (m->valid, 947 * m->dirty, etc...). 948 * 949 * See man buf(9) for more information 950 */ 951 952 resid = bp->b_bufsize; 953 foff = bp->b_offset; 954 955 for (i = 0; i < bp->b_npages; i++) { 956 m = bp->b_pages[i]; 957 vm_page_flag_clear(m, PG_ZERO); 958 if (m == bogus_page) { 959 960 obj = (vm_object_t) vp->v_object; 961 poff = OFF_TO_IDX(bp->b_offset); 962 963 for (j = i; j < bp->b_npages; j++) { 964 m = bp->b_pages[j]; 965 if (m == bogus_page) { 966 m = vm_page_lookup(obj, poff + j); 967 #if !defined(MAX_PERF) 968 if (!m) { 969 panic("brelse: page missing\n"); 970 } 971 #endif 972 bp->b_pages[j] = m; 973 } 974 } 975 976 if ((bp->b_flags & B_INVAL) == 0) { 977 pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); 978 } 979 } 980 if (bp->b_flags & (B_NOCACHE|B_ERROR)) { 981 int poffset = foff & PAGE_MASK; 982 int presid = resid > (PAGE_SIZE - poffset) ? 
983 (PAGE_SIZE - poffset) : resid; 984 985 KASSERT(presid >= 0, ("brelse: extra page")); 986 vm_page_set_invalid(m, poffset, presid); 987 } 988 resid -= PAGE_SIZE - (foff & PAGE_MASK); 989 foff = (foff + PAGE_SIZE) & ~PAGE_MASK; 990 } 991 992 if (bp->b_flags & (B_INVAL | B_RELBUF)) 993 vfs_vmio_release(bp); 994 995 } else if (bp->b_flags & B_VMIO) { 996 997 if (bp->b_flags & (B_INVAL | B_RELBUF)) 998 vfs_vmio_release(bp); 999 1000 } 1001 1002 #if !defined(MAX_PERF) 1003 if (bp->b_qindex != QUEUE_NONE) 1004 panic("brelse: free buffer onto another queue???"); 1005 #endif 1006 if (BUF_REFCNT(bp) > 1) { 1007 /* Temporary panic to verify exclusive locking */ 1008 /* This panic goes away when we allow shared refs */ 1009 panic("brelse: multiple refs"); 1010 /* do not release to free list */ 1011 BUF_UNLOCK(bp); 1012 splx(s); 1013 return; 1014 } 1015 1016 /* enqueue */ 1017 1018 /* buffers with no memory */ 1019 if (bp->b_bufsize == 0) { 1020 bp->b_flags |= B_INVAL; 1021 if (bp->b_kvasize) 1022 bp->b_qindex = QUEUE_EMPTYKVA; 1023 else 1024 bp->b_qindex = QUEUE_EMPTY; 1025 TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist); 1026 LIST_REMOVE(bp, b_hash); 1027 LIST_INSERT_HEAD(&invalhash, bp, b_hash); 1028 bp->b_dev = NODEV; 1029 kvafreespace += bp->b_kvasize; 1030 if (bp->b_kvasize) 1031 kvaspacewakeup(); 1032 /* buffers with junk contents */ 1033 } else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE | B_RELBUF)) { 1034 bp->b_flags |= B_INVAL; 1035 bp->b_qindex = QUEUE_CLEAN; 1036 TAILQ_INSERT_HEAD(&bufqueues[QUEUE_CLEAN], bp, b_freelist); 1037 LIST_REMOVE(bp, b_hash); 1038 LIST_INSERT_HEAD(&invalhash, bp, b_hash); 1039 bp->b_dev = NODEV; 1040 1041 /* buffers that are locked */ 1042 } else if (bp->b_flags & B_LOCKED) { 1043 bp->b_qindex = QUEUE_LOCKED; 1044 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist); 1045 1046 /* remaining buffers */ 1047 } else { 1048 switch(bp->b_flags & (B_DELWRI|B_AGE)) { 1049 case B_DELWRI | B_AGE: 1050 bp->b_qindex = QUEUE_DIRTY; 1051 TAILQ_INSERT_HEAD(&bufqueues[QUEUE_DIRTY], bp, b_freelist); 1052 break; 1053 case B_DELWRI: 1054 bp->b_qindex = QUEUE_DIRTY; 1055 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_DIRTY], bp, b_freelist); 1056 break; 1057 case B_AGE: 1058 bp->b_qindex = QUEUE_CLEAN; 1059 TAILQ_INSERT_HEAD(&bufqueues[QUEUE_CLEAN], bp, b_freelist); 1060 break; 1061 default: 1062 bp->b_qindex = QUEUE_CLEAN; 1063 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp, b_freelist); 1064 break; 1065 } 1066 } 1067 1068 /* 1069 * If B_INVAL, clear B_DELWRI. We've already placed the buffer 1070 * on the correct queue. 1071 */ 1072 if ((bp->b_flags & (B_INVAL|B_DELWRI)) == (B_INVAL|B_DELWRI)) { 1073 bp->b_flags &= ~B_DELWRI; 1074 --numdirtybuffers; 1075 numdirtywakeup(); 1076 } 1077 1078 runningbufspace -= bp->b_bufsize; 1079 1080 /* 1081 * Fixup numfreebuffers count. The bp is on an appropriate queue 1082 * unless locked. We then bump numfreebuffers if it is not B_DELWRI. 1083 * We've already handled the B_INVAL case ( B_DELWRI will be clear 1084 * if B_INVAL is set ). 1085 */ 1086 1087 if ((bp->b_flags & B_LOCKED) == 0 && !(bp->b_flags & B_DELWRI)) 1088 bufcountwakeup(); 1089 1090 /* 1091 * Something we can maybe free. 1092 */ 1093 1094 if (bp->b_bufsize) 1095 bufspacewakeup(); 1096 1097 /* unlock */ 1098 BUF_UNLOCK(bp); 1099 bp->b_flags &= ~(B_ORDERED | B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF); 1100 splx(s); 1101 } 1102 1103 /* 1104 * Release a buffer back to the appropriate queue but do not try to free 1105 * it. 
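 *
 * The practical difference from brelse(), sketched:
 *
 *    brelse(bp);     releases and possibly tears the buffer down
 *                    (B_INVAL, B_NOCACHE and B_RELBUF are honored)
 *    bqrelse(bp);    only requeues on LOCKED, DIRTY or CLEAN; use it
 *                    when the contents are good and likely needed again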
1106 * 1107 * bqrelse() is used by bdwrite() to requeue a delayed write, and used by 1108 * biodone() to requeue an async I/O on completion. It is also used when 1109 * known good buffers need to be requeued but we think we may need the data 1110 * again soon. 1111 */ 1112 void 1113 bqrelse(struct buf * bp) 1114 { 1115 int s; 1116 1117 s = splbio(); 1118 1119 KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); 1120 1121 #if !defined(MAX_PERF) 1122 if (bp->b_qindex != QUEUE_NONE) 1123 panic("bqrelse: free buffer onto another queue???"); 1124 #endif 1125 if (BUF_REFCNT(bp) > 1) { 1126 /* do not release to free list */ 1127 panic("bqrelse: multiple refs"); 1128 BUF_UNLOCK(bp); 1129 splx(s); 1130 return; 1131 } 1132 if (bp->b_flags & B_LOCKED) { 1133 bp->b_flags &= ~B_ERROR; 1134 bp->b_qindex = QUEUE_LOCKED; 1135 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist); 1136 /* buffers with stale but valid contents */ 1137 } else if (bp->b_flags & B_DELWRI) { 1138 bp->b_qindex = QUEUE_DIRTY; 1139 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_DIRTY], bp, b_freelist); 1140 } else { 1141 bp->b_qindex = QUEUE_CLEAN; 1142 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp, b_freelist); 1143 } 1144 1145 runningbufspace -= bp->b_bufsize; 1146 1147 if ((bp->b_flags & B_LOCKED) == 0 && 1148 ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI))) { 1149 bufcountwakeup(); 1150 } 1151 1152 /* 1153 * Something we can maybe wakeup 1154 */ 1155 if (bp->b_bufsize && !(bp->b_flags & B_DELWRI)) 1156 bufspacewakeup(); 1157 1158 /* unlock */ 1159 BUF_UNLOCK(bp); 1160 bp->b_flags &= ~(B_ORDERED | B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF); 1161 splx(s); 1162 } 1163 1164 static void 1165 vfs_vmio_release(bp) 1166 struct buf *bp; 1167 { 1168 int i, s; 1169 vm_page_t m; 1170 1171 s = splvm(); 1172 for (i = 0; i < bp->b_npages; i++) { 1173 m = bp->b_pages[i]; 1174 bp->b_pages[i] = NULL; 1175 /* 1176 * In order to keep page LRU ordering consistent, put 1177 * everything on the inactive queue. 1178 */ 1179 vm_page_unwire(m, 0); 1180 /* 1181 * We don't mess with busy pages, it is 1182 * the responsibility of the process that 1183 * busied the pages to deal with them. 1184 */ 1185 if ((m->flags & PG_BUSY) || (m->busy != 0)) 1186 continue; 1187 1188 if (m->wire_count == 0) { 1189 vm_page_flag_clear(m, PG_ZERO); 1190 /* 1191 * Might as well free the page if we can and it has 1192 * no valid data. 1193 */ 1194 if ((bp->b_flags & B_ASYNC) == 0 && !m->valid && m->hold_count == 0) { 1195 vm_page_busy(m); 1196 vm_page_protect(m, VM_PROT_NONE); 1197 vm_page_free(m); 1198 } 1199 } 1200 } 1201 bufspace -= bp->b_bufsize; 1202 vmiospace -= bp->b_bufsize; 1203 runningbufspace -= bp->b_bufsize; 1204 splx(s); 1205 pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages); 1206 if (bp->b_bufsize) 1207 bufspacewakeup(); 1208 bp->b_npages = 0; 1209 bp->b_bufsize = 0; 1210 bp->b_flags &= ~B_VMIO; 1211 if (bp->b_vp) 1212 brelvp(bp); 1213 } 1214 1215 /* 1216 * Check to see if a block is currently memory resident. 
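 *
 * This only walks the buffer hash chain; it neither locks the buffer it
 * finds nor raises spl itself, so callers run it under splbio(), e.g.:
 *
 *    s = splbio();
 *    bp = gbincore(vp, blkno);    (NULL if not resident)
 *    splx(s);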
1217 */ 1218 struct buf * 1219 gbincore(struct vnode * vp, daddr_t blkno) 1220 { 1221 struct buf *bp; 1222 struct bufhashhdr *bh; 1223 1224 bh = bufhash(vp, blkno); 1225 bp = bh->lh_first; 1226 1227 /* Search hash chain */ 1228 while (bp != NULL) { 1229 /* hit */ 1230 if (bp->b_vp == vp && bp->b_lblkno == blkno && 1231 (bp->b_flags & B_INVAL) == 0) { 1232 break; 1233 } 1234 bp = bp->b_hash.le_next; 1235 } 1236 return (bp); 1237 } 1238 1239 /* 1240 * vfs_bio_awrite: 1241 * 1242 * Implement clustered async writes for clearing out B_DELWRI buffers. 1243 * This is much better then the old way of writing only one buffer at 1244 * a time. Note that we may not be presented with the buffers in the 1245 * correct order, so we search for the cluster in both directions. 1246 */ 1247 int 1248 vfs_bio_awrite(struct buf * bp) 1249 { 1250 int i; 1251 int j; 1252 daddr_t lblkno = bp->b_lblkno; 1253 struct vnode *vp = bp->b_vp; 1254 int s; 1255 int ncl; 1256 struct buf *bpa; 1257 int nwritten; 1258 int size; 1259 int maxcl; 1260 1261 s = splbio(); 1262 /* 1263 * right now we support clustered writing only to regular files. If 1264 * we find a clusterable block we could be in the middle of a cluster 1265 * rather then at the beginning. 1266 */ 1267 if ((vp->v_type == VREG) && 1268 (vp->v_mount != 0) && /* Only on nodes that have the size info */ 1269 (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) { 1270 1271 size = vp->v_mount->mnt_stat.f_iosize; 1272 maxcl = MAXPHYS / size; 1273 1274 for (i = 1; i < maxcl; i++) { 1275 if ((bpa = gbincore(vp, lblkno + i)) && 1276 BUF_REFCNT(bpa) == 0 && 1277 ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) == 1278 (B_DELWRI | B_CLUSTEROK)) && 1279 (bpa->b_bufsize == size)) { 1280 if ((bpa->b_blkno == bpa->b_lblkno) || 1281 (bpa->b_blkno != 1282 bp->b_blkno + ((i * size) >> DEV_BSHIFT))) 1283 break; 1284 } else { 1285 break; 1286 } 1287 } 1288 for (j = 1; i + j <= maxcl && j <= lblkno; j++) { 1289 if ((bpa = gbincore(vp, lblkno - j)) && 1290 BUF_REFCNT(bpa) == 0 && 1291 ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) == 1292 (B_DELWRI | B_CLUSTEROK)) && 1293 (bpa->b_bufsize == size)) { 1294 if ((bpa->b_blkno == bpa->b_lblkno) || 1295 (bpa->b_blkno != 1296 bp->b_blkno - ((j * size) >> DEV_BSHIFT))) 1297 break; 1298 } else { 1299 break; 1300 } 1301 } 1302 --j; 1303 ncl = i + j; 1304 /* 1305 * this is a possible cluster write 1306 */ 1307 if (ncl != 1) { 1308 nwritten = cluster_wbuild(vp, size, lblkno - j, ncl); 1309 splx(s); 1310 return nwritten; 1311 } 1312 } 1313 1314 BUF_LOCK(bp, LK_EXCLUSIVE); 1315 bremfree(bp); 1316 bp->b_flags |= B_ASYNC; 1317 1318 splx(s); 1319 /* 1320 * default (old) behavior, writing out only one block 1321 * 1322 * XXX returns b_bufsize instead of b_bcount for nwritten? 1323 */ 1324 nwritten = bp->b_bufsize; 1325 (void) VOP_BWRITE(bp->b_vp, bp); 1326 1327 return nwritten; 1328 } 1329 1330 /* 1331 * getnewbuf: 1332 * 1333 * Find and initialize a new buffer header, freeing up existing buffers 1334 * in the bufqueues as necessary. The new buffer is returned locked. 1335 * 1336 * Important: B_INVAL is not set. If the caller wishes to throw the 1337 * buffer away, the caller must set B_INVAL prior to calling brelse(). 
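 *
 * getblk() below does exactly that when it loses a creation race and has
 * to discard a freshly allocated buffer:
 *
 *    if (gbincore(vp, blkno)) {
 *        bp->b_flags |= B_INVAL;
 *        brelse(bp);
 *        goto loop;
 *    }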
1338 * 1339 * We block if: 1340 * We have insufficient buffer headers 1341 * We have insufficient buffer space 1342 * buffer_map is too fragmented ( space reservation fails ) 1343 * If we have to flush dirty buffers ( but we try to avoid this ) 1344 * 1345 * To avoid VFS layer recursion we do not flush dirty buffers ourselves. 1346 * Instead we ask the buf daemon to do it for us. We attempt to 1347 * avoid piecemeal wakeups of the pageout daemon. 1348 */ 1349 1350 static struct buf * 1351 getnewbuf(int slpflag, int slptimeo, int size, int maxsize) 1352 { 1353 struct buf *bp; 1354 struct buf *nbp; 1355 struct buf *dbp; 1356 int outofspace; 1357 int nqindex; 1358 int defrag = 0; 1359 1360 ++getnewbufcalls; 1361 --getnewbufrestarts; 1362 restart: 1363 ++getnewbufrestarts; 1364 1365 /* 1366 * Calculate whether we are out of buffer space. This state is 1367 * recalculated on every restart. If we are out of space, we 1368 * have to turn off defragmentation. Setting defrag to -1 when 1369 * outofspace is positive means "defrag while freeing buffers". 1370 * The looping conditional will be muffed up if defrag is left 1371 * positive when outofspace is positive. 1372 */ 1373 1374 dbp = NULL; 1375 outofspace = 0; 1376 if (bufspace >= hibufspace) { 1377 if ((curproc->p_flag & P_BUFEXHAUST) == 0 || 1378 bufspace >= maxbufspace) { 1379 outofspace = 1; 1380 if (defrag > 0) 1381 defrag = -1; 1382 } 1383 } 1384 1385 /* 1386 * defrag state is semi-persistent. 1 means we are flagged for 1387 * defragging. -1 means we actually defragged something. 1388 */ 1389 /* nop */ 1390 1391 /* 1392 * Setup for scan. If we do not have enough free buffers, 1393 * we set up a degenerate case that immediately fails. Note 1394 * that if we are a specially marked process, we are allowed to 1395 * dip into our reserves. 1396 * 1397 * Normally we want to find an EMPTYKVA buffer. That is, a 1398 * buffer with kva already allocated. If there are no EMPTYKVA 1399 * buffers we back up to the truly EMPTY buffers. When defragging 1400 * we do not bother backing up since we have to locate buffers with 1401 * kva to defrag. If we are out of space we skip both EMPTY and 1402 * EMPTYKVA and dig right into the CLEAN queue. 1403 * 1404 * In this manner we avoid scanning unnecessary buffers. It is very 1405 * important for us to do this because the buffer cache is almost 1406 * constantly out of space or in need of defragmentation. 1407 */ 1408 1409 if ((curproc->p_flag & P_BUFEXHAUST) == 0 && 1410 numfreebuffers < lofreebuffers) { 1411 nqindex = QUEUE_CLEAN; 1412 nbp = NULL; 1413 } else { 1414 nqindex = QUEUE_EMPTYKVA; 1415 nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]); 1416 if (nbp == NULL) { 1417 if (defrag <= 0) { 1418 nqindex = QUEUE_EMPTY; 1419 nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]); 1420 } 1421 } 1422 if (outofspace || nbp == NULL) { 1423 nqindex = QUEUE_CLEAN; 1424 nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]); 1425 } 1426 } 1427 1428 /* 1429 * Run scan, possibly freeing data and/or kva mappings on the fly 1430 * depending. 1431 */ 1432 1433 while ((bp = nbp) != NULL) { 1434 int qindex = nqindex; 1435 1436 /* 1437 * Calculate next bp ( we can only use it if we do not block 1438 * or do other fancy things ). 
1439 */ 1440 if ((nbp = TAILQ_NEXT(bp, b_freelist)) == NULL) { 1441 switch(qindex) { 1442 case QUEUE_EMPTY: 1443 nqindex = QUEUE_EMPTYKVA; 1444 if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]))) 1445 break; 1446 /* fall through */ 1447 case QUEUE_EMPTYKVA: 1448 nqindex = QUEUE_CLEAN; 1449 if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]))) 1450 break; 1451 /* fall through */ 1452 case QUEUE_CLEAN: 1453 /* 1454 * nbp is NULL. 1455 */ 1456 break; 1457 } 1458 } 1459 1460 /* 1461 * Sanity Checks 1462 */ 1463 KASSERT(bp->b_qindex == qindex, ("getnewbuf: inconsistant queue %d bp %p", qindex, bp)); 1464 1465 /* 1466 * Note: we no longer distinguish between VMIO and non-VMIO 1467 * buffers. 1468 */ 1469 1470 KASSERT((bp->b_flags & B_DELWRI) == 0, ("delwri buffer %p found in queue %d", bp, qindex)); 1471 1472 /* 1473 * If we are defragging and the buffer isn't useful for fixing 1474 * that problem we continue. If we are out of space and the 1475 * buffer isn't useful for fixing that problem we continue. 1476 */ 1477 1478 if (defrag > 0 && bp->b_kvasize == 0) 1479 continue; 1480 if (outofspace > 0 && bp->b_bufsize == 0) 1481 continue; 1482 1483 /* 1484 * Start freeing the bp. This is somewhat involved. nbp 1485 * remains valid only for QUEUE_EMPTY[KVA] bp's. 1486 */ 1487 1488 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) != 0) 1489 panic("getnewbuf: locked buf"); 1490 bremfree(bp); 1491 1492 if (qindex == QUEUE_CLEAN) { 1493 if (bp->b_flags & B_VMIO) { 1494 bp->b_flags &= ~B_ASYNC; 1495 vfs_vmio_release(bp); 1496 } 1497 if (bp->b_vp) 1498 brelvp(bp); 1499 } 1500 1501 /* 1502 * NOTE: nbp is now entirely invalid. We can only restart 1503 * the scan from this point on. 1504 * 1505 * Get the rest of the buffer freed up. b_kva* is still 1506 * valid after this operation. 1507 */ 1508 1509 if (bp->b_rcred != NOCRED) { 1510 crfree(bp->b_rcred); 1511 bp->b_rcred = NOCRED; 1512 } 1513 if (bp->b_wcred != NOCRED) { 1514 crfree(bp->b_wcred); 1515 bp->b_wcred = NOCRED; 1516 } 1517 if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_deallocate) 1518 (*bioops.io_deallocate)(bp); 1519 LIST_REMOVE(bp, b_hash); 1520 LIST_INSERT_HEAD(&invalhash, bp, b_hash); 1521 1522 if (bp->b_bufsize) 1523 allocbuf(bp, 0); 1524 1525 bp->b_flags = 0; 1526 bp->b_dev = NODEV; 1527 bp->b_vp = NULL; 1528 bp->b_blkno = bp->b_lblkno = 0; 1529 bp->b_offset = NOOFFSET; 1530 bp->b_iodone = 0; 1531 bp->b_error = 0; 1532 bp->b_resid = 0; 1533 bp->b_bcount = 0; 1534 bp->b_npages = 0; 1535 bp->b_dirtyoff = bp->b_dirtyend = 0; 1536 1537 LIST_INIT(&bp->b_dep); 1538 1539 /* 1540 * Ok, now that we have a free buffer, if we are defragging 1541 * we have to recover the kvaspace. If we are out of space 1542 * we have to free the buffer (which we just did), but we 1543 * do not have to recover kva space unless we hit a defrag 1544 * hicup. Being able to avoid freeing the kva space leads 1545 * to a significant reduction in overhead. 1546 */ 1547 1548 if (defrag > 0) { 1549 defrag = -1; 1550 bp->b_flags |= B_INVAL; 1551 bfreekva(bp); 1552 brelse(bp); 1553 goto restart; 1554 } 1555 1556 if (outofspace > 0) { 1557 outofspace = -1; 1558 bp->b_flags |= B_INVAL; 1559 if (defrag < 0) 1560 bfreekva(bp); 1561 brelse(bp); 1562 goto restart; 1563 } 1564 1565 /* 1566 * We are done 1567 */ 1568 break; 1569 } 1570 1571 /* 1572 * If we exhausted our list, sleep as appropriate. We may have to 1573 * wakeup various daemons and write out some dirty buffers. 1574 * 1575 * Generally we are sleeping due to insufficient buffer space. 
1576 */ 1577 1578 if (bp == NULL) { 1579 int flags; 1580 char *waitmsg; 1581 1582 dosleep: 1583 if (defrag > 0) { 1584 flags = VFS_BIO_NEED_KVASPACE; 1585 waitmsg = "nbufkv"; 1586 } else if (outofspace > 0) { 1587 waitmsg = "nbufbs"; 1588 flags = VFS_BIO_NEED_BUFSPACE; 1589 } else { 1590 waitmsg = "newbuf"; 1591 flags = VFS_BIO_NEED_ANY; 1592 } 1593 1594 /* XXX */ 1595 1596 (void) speedup_syncer(); 1597 needsbuffer |= flags; 1598 while (needsbuffer & flags) { 1599 if (tsleep(&needsbuffer, (PRIBIO + 4) | slpflag, 1600 waitmsg, slptimeo)) 1601 return (NULL); 1602 } 1603 } else { 1604 /* 1605 * We finally have a valid bp. We aren't quite out of the 1606 * woods, we still have to reserve kva space. 1607 */ 1608 vm_offset_t addr = 0; 1609 1610 maxsize = (maxsize + PAGE_MASK) & ~PAGE_MASK; 1611 1612 if (maxsize != bp->b_kvasize) { 1613 bfreekva(bp); 1614 1615 if (vm_map_findspace(buffer_map, 1616 vm_map_min(buffer_map), maxsize, &addr)) { 1617 /* 1618 * Uh oh. Buffer map is to fragmented. Try 1619 * to defragment. 1620 */ 1621 if (defrag <= 0) { 1622 defrag = 1; 1623 bp->b_flags |= B_INVAL; 1624 brelse(bp); 1625 goto restart; 1626 } 1627 /* 1628 * Uh oh. We couldn't seem to defragment 1629 */ 1630 bp = NULL; 1631 goto dosleep; 1632 } 1633 } 1634 if (addr) { 1635 vm_map_insert(buffer_map, NULL, 0, 1636 addr, addr + maxsize, 1637 VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT); 1638 1639 bp->b_kvabase = (caddr_t) addr; 1640 bp->b_kvasize = maxsize; 1641 } 1642 bp->b_data = bp->b_kvabase; 1643 } 1644 return(bp); 1645 } 1646 1647 /* 1648 * waitfreebuffers: 1649 * 1650 * Wait for sufficient free buffers. Only called from normal processes. 1651 */ 1652 1653 static void 1654 waitfreebuffers(int slpflag, int slptimeo) 1655 { 1656 while (numfreebuffers < hifreebuffers) { 1657 if (numfreebuffers >= hifreebuffers) 1658 break; 1659 needsbuffer |= VFS_BIO_NEED_FREE; 1660 if (tsleep(&needsbuffer, (PRIBIO + 4)|slpflag, "biofre", slptimeo)) 1661 break; 1662 } 1663 } 1664 1665 /* 1666 * buf_daemon: 1667 * 1668 * buffer flushing daemon. Buffers are normally flushed by the 1669 * update daemon but if it cannot keep up this process starts to 1670 * take the load in an attempt to prevent getnewbuf() from blocking. 1671 */ 1672 1673 static struct proc *bufdaemonproc; 1674 static int bd_interval; 1675 static int bd_flushto; 1676 1677 static struct kproc_desc buf_kp = { 1678 "bufdaemon", 1679 buf_daemon, 1680 &bufdaemonproc 1681 }; 1682 SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp) 1683 1684 static void 1685 buf_daemon() 1686 { 1687 int s; 1688 /* 1689 * This process is allowed to take the buffer cache to the limit 1690 */ 1691 curproc->p_flag |= P_BUFEXHAUST; 1692 s = splbio(); 1693 1694 bd_interval = 5 * hz; /* dynamically adjusted */ 1695 bd_flushto = hidirtybuffers; /* dynamically adjusted */ 1696 1697 while (TRUE) { 1698 bd_request = 0; 1699 1700 /* 1701 * Do the flush. Limit the number of buffers we flush in one 1702 * go. The failure condition occurs when processes are writing 1703 * buffers faster then we can dispose of them. In this case 1704 * we may be flushing so often that the previous set of flushes 1705 * have not had time to complete, causing us to run out of 1706 * physical buffers and block. 
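 *
 * As a rough worked example of the feedback adjustment made after the
 * flush loop below (numbers picked purely for illustration): with
 * lodirtybuffers = 40 and hidirtybuffers = 100, brange = 60 and the
 * target middb = 100 - 60/5 = 88 dirty buffers.  If numdirtybuffers is
 * currently 68, deltabuf = 20, so bd_flushto rises by 20/20 = 1 buffer
 * and bd_interval grows by 20 * 2hz / 60, about two thirds of a second.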
1707 */ 1708 { 1709 int runcount = maxbdrun; 1710 1711 while (numdirtybuffers > bd_flushto && runcount) { 1712 --runcount; 1713 if (flushbufqueues() == 0) 1714 break; 1715 } 1716 } 1717 1718 /* 1719 * If nobody is requesting anything we sleep 1720 */ 1721 if (bd_request == 0) 1722 tsleep(&bd_request, PVM, "psleep", bd_interval); 1723 1724 /* 1725 * We calculate how much to add or subtract from bd_flushto 1726 * and bd_interval based on how far off we are from the 1727 * optimal number of dirty buffers, which is 20% below the 1728 * hidirtybuffers mark. We cannot use hidirtybuffers straight 1729 * because being right on the mark will cause getnewbuf() 1730 * to oscillate our wakeup. 1731 * 1732 * The larger the error in either direction, the more we adjust 1733 * bd_flushto and bd_interval. The time interval is adjusted 1734 * by 2 seconds per whole-buffer-range of error. This is an 1735 * exponential convergence algorithm, with large errors 1736 * producing large changes and small errors producing small 1737 * changes. 1738 */ 1739 1740 { 1741 int brange = hidirtybuffers - lodirtybuffers; 1742 int middb = hidirtybuffers - brange / 5; 1743 int deltabuf = middb - numdirtybuffers; 1744 1745 bd_flushto += deltabuf / 20; 1746 bd_interval += deltabuf * (2 * hz) / (brange * 1); 1747 } 1748 if (bd_flushto < lodirtybuffers) 1749 bd_flushto = lodirtybuffers; 1750 if (bd_flushto > hidirtybuffers) 1751 bd_flushto = hidirtybuffers; 1752 if (bd_interval < hz / 10) 1753 bd_interval = hz / 10; 1754 if (bd_interval > 5 * hz) 1755 bd_interval = 5 * hz; 1756 } 1757 } 1758 1759 /* 1760 * flushbufqueues: 1761 * 1762 * Try to flush a buffer in the dirty queue. We must be careful to 1763 * free up B_INVAL buffers instead of write them, which NFS is 1764 * particularly sensitive to. 1765 */ 1766 1767 static int 1768 flushbufqueues(void) 1769 { 1770 struct buf *bp; 1771 int r = 0; 1772 1773 bp = TAILQ_FIRST(&bufqueues[QUEUE_DIRTY]); 1774 1775 while (bp) { 1776 KASSERT((bp->b_flags & B_DELWRI), ("unexpected clean buffer %p", bp)); 1777 if ((bp->b_flags & B_DELWRI) != 0) { 1778 if (bp->b_flags & B_INVAL) { 1779 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) != 0) 1780 panic("flushbufqueues: locked buf"); 1781 bremfree(bp); 1782 brelse(bp); 1783 ++r; 1784 break; 1785 } 1786 vfs_bio_awrite(bp); 1787 ++r; 1788 break; 1789 } 1790 bp = TAILQ_NEXT(bp, b_freelist); 1791 } 1792 return(r); 1793 } 1794 1795 /* 1796 * Check to see if a block is currently memory resident. 1797 */ 1798 struct buf * 1799 incore(struct vnode * vp, daddr_t blkno) 1800 { 1801 struct buf *bp; 1802 1803 int s = splbio(); 1804 bp = gbincore(vp, blkno); 1805 splx(s); 1806 return (bp); 1807 } 1808 1809 /* 1810 * Returns true if no I/O is needed to access the 1811 * associated VM object. This is like incore except 1812 * it also hunts around in the VM system for the data. 
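 *
 * breadn() above uses this to skip read-ahead for blocks whose data is
 * already resident in VM pages even when no struct buf maps them:
 *
 *    if (inmem(vp, *rablkno))
 *        continue;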
1813 */ 1814 1815 int 1816 inmem(struct vnode * vp, daddr_t blkno) 1817 { 1818 vm_object_t obj; 1819 vm_offset_t toff, tinc, size; 1820 vm_page_t m; 1821 vm_ooffset_t off; 1822 1823 if (incore(vp, blkno)) 1824 return 1; 1825 if (vp->v_mount == NULL) 1826 return 0; 1827 if ((vp->v_object == NULL) || (vp->v_flag & VOBJBUF) == 0) 1828 return 0; 1829 1830 obj = vp->v_object; 1831 size = PAGE_SIZE; 1832 if (size > vp->v_mount->mnt_stat.f_iosize) 1833 size = vp->v_mount->mnt_stat.f_iosize; 1834 off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize; 1835 1836 for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) { 1837 m = vm_page_lookup(obj, OFF_TO_IDX(off + toff)); 1838 if (!m) 1839 return 0; 1840 tinc = size; 1841 if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK)) 1842 tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK); 1843 if (vm_page_is_valid(m, 1844 (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0) 1845 return 0; 1846 } 1847 return 1; 1848 } 1849 1850 /* 1851 * vfs_setdirty: 1852 * 1853 * Sets the dirty range for a buffer based on the status of the dirty 1854 * bits in the pages comprising the buffer. 1855 * 1856 * The range is limited to the size of the buffer. 1857 * 1858 * This routine is primarily used by NFS, but is generalized for the 1859 * B_VMIO case. 1860 */ 1861 static void 1862 vfs_setdirty(struct buf *bp) 1863 { 1864 int i; 1865 vm_object_t object; 1866 1867 /* 1868 * Degenerate case - empty buffer 1869 */ 1870 1871 if (bp->b_bufsize == 0) 1872 return; 1873 1874 /* 1875 * We qualify the scan for modified pages on whether the 1876 * object has been flushed yet. The OBJ_WRITEABLE flag 1877 * is not cleared simply by protecting pages off. 1878 */ 1879 1880 if ((bp->b_flags & B_VMIO) == 0) 1881 return; 1882 1883 object = bp->b_pages[0]->object; 1884 1885 if ((object->flags & OBJ_WRITEABLE) && !(object->flags & OBJ_MIGHTBEDIRTY)) 1886 printf("Warning: object %p writeable but not mightbedirty\n", object); 1887 if (!(object->flags & OBJ_WRITEABLE) && (object->flags & OBJ_MIGHTBEDIRTY)) 1888 printf("Warning: object %p mightbedirty but not writeable\n", object); 1889 1890 if (object->flags & (OBJ_MIGHTBEDIRTY|OBJ_CLEANING)) { 1891 vm_offset_t boffset; 1892 vm_offset_t eoffset; 1893 1894 /* 1895 * test the pages to see if they have been modified directly 1896 * by users through the VM system. 1897 */ 1898 for (i = 0; i < bp->b_npages; i++) { 1899 vm_page_flag_clear(bp->b_pages[i], PG_ZERO); 1900 vm_page_test_dirty(bp->b_pages[i]); 1901 } 1902 1903 /* 1904 * Calculate the encompassing dirty range, boffset and eoffset, 1905 * (eoffset - boffset) bytes. 1906 */ 1907 1908 for (i = 0; i < bp->b_npages; i++) { 1909 if (bp->b_pages[i]->dirty) 1910 break; 1911 } 1912 boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); 1913 1914 for (i = bp->b_npages - 1; i >= 0; --i) { 1915 if (bp->b_pages[i]->dirty) { 1916 break; 1917 } 1918 } 1919 eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); 1920 1921 /* 1922 * Fit it to the buffer. 1923 */ 1924 1925 if (eoffset > bp->b_bcount) 1926 eoffset = bp->b_bcount; 1927 1928 /* 1929 * If we have a good dirty range, merge with the existing 1930 * dirty range. 1931 */ 1932 1933 if (boffset < eoffset) { 1934 if (bp->b_dirtyoff > boffset) 1935 bp->b_dirtyoff = boffset; 1936 if (bp->b_dirtyend < eoffset) 1937 bp->b_dirtyend = eoffset; 1938 } 1939 } 1940 } 1941 1942 /* 1943 * getblk: 1944 * 1945 * Get a block given a specified block and offset into a file/device. 
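 *
 * A sketch of the common caller pattern (bread() above is the canonical
 * read-side user; details vary by filesystem):
 *
 *    bp = getblk(vp, blkno, size, 0, 0);
 *    if ((bp->b_flags & B_CACHE) == 0) {
 *        bp->b_flags |= B_READ;
 *        bp->b_flags &= ~(B_INVAL | B_ERROR);
 *        vfs_busy_pages(bp, 0);
 *        VOP_STRATEGY(vp, bp);
 *        (void) biowait(bp);
 *    }
 *    (a caller that will overwrite the entire buffer instead just
 *    clears B_INVAL, optionally setting B_CACHE as an optimization)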
1946 * The buffer's B_DONE bit will be cleared on return, making it almost 1947 * ready for an I/O initiation. B_INVAL may or may not be set on 1948 * return. The caller should clear B_INVAL prior to initiating a 1949 * READ. 1950 * 1951 * For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for 1952 * an existing buffer. 1953 * 1954 * For a VMIO buffer, B_CACHE is modified according to the backing VM. 1955 * If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set 1956 * and then cleared based on the backing VM. If the previous buffer is 1957 * non-0-sized but invalid, B_CACHE will be cleared. 1958 * 1959 * If getblk() must create a new buffer, the new buffer is returned with 1960 * both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which 1961 * case it is returned with B_INVAL clear and B_CACHE set based on the 1962 * backing VM. 1963 * 1964 * getblk() also forces a VOP_BWRITE() for any B_DELWRI buffer whose 1965 * B_CACHE bit is clear. 1966 * 1967 * What this means, basically, is that the caller should use B_CACHE to 1968 * determine whether the buffer is fully valid or not and should clear 1969 * B_INVAL prior to issuing a read. If the caller intends to validate 1970 * the buffer by loading its data area with something, the caller needs 1971 * to clear B_INVAL. If the caller does this without issuing an I/O, 1972 * the caller should set B_CACHE ( as an optimization ), else the caller 1973 * should issue the I/O and biodone() will set B_CACHE if the I/O was 1974 * a write attempt or if it was a successful read. If the caller 1975 * intends to issue a READ, the caller must clear B_INVAL and B_ERROR 1976 * prior to issuing the READ. biodone() will *not* clear B_INVAL. 1977 */ 1978 struct buf * 1979 getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo) 1980 { 1981 struct buf *bp; 1982 int s; 1983 struct bufhashhdr *bh; 1984 1985 #if !defined(MAX_PERF) 1986 if (size > MAXBSIZE) 1987 panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE); 1988 #endif 1989 1990 s = splbio(); 1991 loop: 1992 /* 1993 * Block if we are low on buffers. Certain processes are allowed 1994 * to completely exhaust the buffer cache. 1995 */ 1996 if (curproc->p_flag & P_BUFEXHAUST) { 1997 if (numfreebuffers == 0) { 1998 needsbuffer |= VFS_BIO_NEED_ANY; 1999 tsleep(&needsbuffer, (PRIBIO + 4) | slpflag, "newbuf", 2000 slptimeo); 2001 } 2002 } else if (numfreebuffers < lofreebuffers) { 2003 waitfreebuffers(slpflag, slptimeo); 2004 } 2005 2006 if ((bp = gbincore(vp, blkno))) { 2007 /* 2008 * Buffer is in-core. If the buffer is not busy, it must 2009 * be on a queue. 2010 */ 2011 2012 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { 2013 if (BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL, 2014 "getblk", slpflag, slptimeo) == ENOLCK) 2015 goto loop; 2016 splx(s); 2017 return (struct buf *) NULL; 2018 } 2019 2020 /* 2021 * The buffer is locked. B_CACHE is cleared if the buffer is 2022 * invalid. Otherwise, for a non-VMIO buffer, B_CACHE is set 2023 * and for a VMIO buffer B_CACHE is adjusted according to the 2024 * backing VM cache. 2025 */ 2026 if (bp->b_flags & B_INVAL) 2027 bp->b_flags &= ~B_CACHE; 2028 else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0) 2029 bp->b_flags |= B_CACHE; 2030 bremfree(bp); 2031 2032 /* 2033 * check for size inconsistencies for non-VMIO case. 
2034 */ 2035 2036 if (bp->b_bcount != size) { 2037 if ((bp->b_flags & B_VMIO) == 0 || 2038 (size > bp->b_kvasize)) { 2039 if (bp->b_flags & B_DELWRI) { 2040 bp->b_flags |= B_NOCACHE; 2041 VOP_BWRITE(bp->b_vp, bp); 2042 } else { 2043 if ((bp->b_flags & B_VMIO) && 2044 (LIST_FIRST(&bp->b_dep) == NULL)) { 2045 bp->b_flags |= B_RELBUF; 2046 brelse(bp); 2047 } else { 2048 bp->b_flags |= B_NOCACHE; 2049 VOP_BWRITE(bp->b_vp, bp); 2050 } 2051 } 2052 goto loop; 2053 } 2054 } 2055 2056 /* 2057 * If the size is inconsistant in the VMIO case, we can resize 2058 * the buffer. This might lead to B_CACHE getting set or 2059 * cleared. If the size has not changed, B_CACHE remains 2060 * unchanged from its previous state. 2061 */ 2062 2063 if (bp->b_bcount != size) 2064 allocbuf(bp, size); 2065 2066 KASSERT(bp->b_offset != NOOFFSET, 2067 ("getblk: no buffer offset")); 2068 2069 /* 2070 * A buffer with B_DELWRI set and B_CACHE clear must 2071 * be committed before we can return the buffer in 2072 * order to prevent the caller from issuing a read 2073 * ( due to B_CACHE not being set ) and overwriting 2074 * it. 2075 * 2076 * Most callers, including NFS and FFS, need this to 2077 * operate properly either because they assume they 2078 * can issue a read if B_CACHE is not set, or because 2079 * ( for example ) an uncached B_DELWRI might loop due 2080 * to softupdates re-dirtying the buffer. In the latter 2081 * case, B_CACHE is set after the first write completes, 2082 * preventing further loops. 2083 */ 2084 2085 if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) { 2086 VOP_BWRITE(bp->b_vp, bp); 2087 goto loop; 2088 } 2089 2090 splx(s); 2091 bp->b_flags &= ~B_DONE; 2092 } else { 2093 /* 2094 * Buffer is not in-core, create new buffer. The buffer 2095 * returned by getnewbuf() is locked. Note that the returned 2096 * buffer is also considered valid (not marked B_INVAL). 2097 */ 2098 int bsize, maxsize, vmio; 2099 off_t offset; 2100 2101 if (vp->v_type == VBLK) 2102 bsize = DEV_BSIZE; 2103 else if (vp->v_mountedhere) 2104 bsize = vp->v_mountedhere->mnt_stat.f_iosize; 2105 else if (vp->v_mount) 2106 bsize = vp->v_mount->mnt_stat.f_iosize; 2107 else 2108 bsize = size; 2109 2110 offset = (off_t)blkno * bsize; 2111 vmio = (vp->v_object != 0) && (vp->v_flag & VOBJBUF); 2112 maxsize = vmio ? size + (offset & PAGE_MASK) : size; 2113 maxsize = imax(maxsize, bsize); 2114 2115 if ((bp = getnewbuf(slpflag, slptimeo, size, maxsize)) == NULL) { 2116 if (slpflag || slptimeo) { 2117 splx(s); 2118 return NULL; 2119 } 2120 goto loop; 2121 } 2122 2123 /* 2124 * This code is used to make sure that a buffer is not 2125 * created while the getnewbuf routine is blocked. 2126 * This can be a problem whether the vnode is locked or not. 2127 * If the buffer is created out from under us, we have to 2128 * throw away the one we just created. There is now window 2129 * race because we are safely running at splbio() from the 2130 * point of the duplicate buffer creation through to here, 2131 * and we've locked the buffer. 2132 */ 2133 if (gbincore(vp, blkno)) { 2134 bp->b_flags |= B_INVAL; 2135 brelse(bp); 2136 goto loop; 2137 } 2138 2139 /* 2140 * Insert the buffer into the hash, so that it can 2141 * be found by incore. 2142 */ 2143 bp->b_blkno = bp->b_lblkno = blkno; 2144 bp->b_offset = offset; 2145 2146 bgetvp(vp, bp); 2147 LIST_REMOVE(bp, b_hash); 2148 bh = bufhash(vp, blkno); 2149 LIST_INSERT_HEAD(bh, bp, b_hash); 2150 2151 /* 2152 * set B_VMIO bit. allocbuf() the buffer bigger. 
Since the 2153 * buffer size starts out as 0, B_CACHE will be set by 2154 * allocbuf() for the VMIO case prior to it testing the 2155 * backing store for validity. 2156 */ 2157 2158 if (vmio) { 2159 bp->b_flags |= B_VMIO; 2160 #if defined(VFS_BIO_DEBUG) 2161 if (vp->v_type != VREG && vp->v_type != VBLK) 2162 printf("getblk: vmioing file type %d???\n", vp->v_type); 2163 #endif 2164 } else { 2165 bp->b_flags &= ~B_VMIO; 2166 } 2167 2168 allocbuf(bp, size); 2169 2170 splx(s); 2171 bp->b_flags &= ~B_DONE; 2172 } 2173 return (bp); 2174 } 2175 2176 /* 2177 * Get an empty, disassociated buffer of given size. The buffer is initially 2178 * set to B_INVAL. 2179 */ 2180 struct buf * 2181 geteblk(int size) 2182 { 2183 struct buf *bp; 2184 int s; 2185 2186 s = splbio(); 2187 while ((bp = getnewbuf(0, 0, size, MAXBSIZE)) == 0); 2188 splx(s); 2189 allocbuf(bp, size); 2190 bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */ 2191 return (bp); 2192 } 2193 2194 2195 /* 2196 * This code constitutes the buffer memory from either anonymous system 2197 * memory (in the case of non-VMIO operations) or from an associated 2198 * VM object (in the case of VMIO operations). This code is able to 2199 * resize a buffer up or down. 2200 * 2201 * Note that this code is tricky, and has many complications to resolve 2202 * deadlock or inconsistant data situations. Tread lightly!!! 2203 * There are B_CACHE and B_DELWRI interactions that must be dealt with by 2204 * the caller. Calling this code willy nilly can result in the loss of data. 2205 * 2206 * allocbuf() only adjusts B_CACHE for VMIO buffers. getblk() deals with 2207 * B_CACHE for the non-VMIO case. 2208 */ 2209 2210 int 2211 allocbuf(struct buf *bp, int size) 2212 { 2213 int newbsize, mbsize; 2214 int i; 2215 2216 #if !defined(MAX_PERF) 2217 if (BUF_REFCNT(bp) == 0) 2218 panic("allocbuf: buffer not busy"); 2219 2220 if (bp->b_kvasize < size) 2221 panic("allocbuf: buffer too small"); 2222 #endif 2223 2224 if ((bp->b_flags & B_VMIO) == 0) { 2225 caddr_t origbuf; 2226 int origbufsize; 2227 /* 2228 * Just get anonymous memory from the kernel. Don't 2229 * mess with B_CACHE. 2230 */ 2231 mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); 2232 #if !defined(NO_B_MALLOC) 2233 if (bp->b_flags & B_MALLOC) 2234 newbsize = mbsize; 2235 else 2236 #endif 2237 newbsize = round_page(size); 2238 2239 if (newbsize < bp->b_bufsize) { 2240 #if !defined(NO_B_MALLOC) 2241 /* 2242 * malloced buffers are not shrunk 2243 */ 2244 if (bp->b_flags & B_MALLOC) { 2245 if (newbsize) { 2246 bp->b_bcount = size; 2247 } else { 2248 free(bp->b_data, M_BIOBUF); 2249 bufspace -= bp->b_bufsize; 2250 bufmallocspace -= bp->b_bufsize; 2251 runningbufspace -= bp->b_bufsize; 2252 if (bp->b_bufsize) 2253 bufspacewakeup(); 2254 bp->b_data = bp->b_kvabase; 2255 bp->b_bufsize = 0; 2256 bp->b_bcount = 0; 2257 bp->b_flags &= ~B_MALLOC; 2258 } 2259 return 1; 2260 } 2261 #endif 2262 vm_hold_free_pages( 2263 bp, 2264 (vm_offset_t) bp->b_data + newbsize, 2265 (vm_offset_t) bp->b_data + bp->b_bufsize); 2266 } else if (newbsize > bp->b_bufsize) { 2267 #if !defined(NO_B_MALLOC) 2268 /* 2269 * We only use malloced memory on the first allocation. 2270 * and revert to page-allocated memory when the buffer 2271 * grows. 
2272 */ 2273 if ( (bufmallocspace < maxbufmallocspace) && 2274 (bp->b_bufsize == 0) && 2275 (mbsize <= PAGE_SIZE/2)) { 2276 2277 bp->b_data = malloc(mbsize, M_BIOBUF, M_WAITOK); 2278 bp->b_bufsize = mbsize; 2279 bp->b_bcount = size; 2280 bp->b_flags |= B_MALLOC; 2281 bufspace += mbsize; 2282 bufmallocspace += mbsize; 2283 runningbufspace += bp->b_bufsize; 2284 return 1; 2285 } 2286 #endif 2287 origbuf = NULL; 2288 origbufsize = 0; 2289 #if !defined(NO_B_MALLOC) 2290 /* 2291 * If the buffer is growing on its other-than-first allocation, 2292 * then we revert to the page-allocation scheme. 2293 */ 2294 if (bp->b_flags & B_MALLOC) { 2295 origbuf = bp->b_data; 2296 origbufsize = bp->b_bufsize; 2297 bp->b_data = bp->b_kvabase; 2298 bufspace -= bp->b_bufsize; 2299 bufmallocspace -= bp->b_bufsize; 2300 runningbufspace -= bp->b_bufsize; 2301 if (bp->b_bufsize) 2302 bufspacewakeup(); 2303 bp->b_bufsize = 0; 2304 bp->b_flags &= ~B_MALLOC; 2305 newbsize = round_page(newbsize); 2306 } 2307 #endif 2308 vm_hold_load_pages( 2309 bp, 2310 (vm_offset_t) bp->b_data + bp->b_bufsize, 2311 (vm_offset_t) bp->b_data + newbsize); 2312 #if !defined(NO_B_MALLOC) 2313 if (origbuf) { 2314 bcopy(origbuf, bp->b_data, origbufsize); 2315 free(origbuf, M_BIOBUF); 2316 } 2317 #endif 2318 } 2319 } else { 2320 vm_page_t m; 2321 int desiredpages; 2322 2323 newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); 2324 desiredpages = (size == 0) ? 0 : 2325 num_pages((bp->b_offset & PAGE_MASK) + newbsize); 2326 2327 #if !defined(NO_B_MALLOC) 2328 if (bp->b_flags & B_MALLOC) 2329 panic("allocbuf: VMIO buffer can't be malloced"); 2330 #endif 2331 /* 2332 * Set B_CACHE initially if buffer is 0 length or will become 2333 * 0-length. 2334 */ 2335 if (size == 0 || bp->b_bufsize == 0) 2336 bp->b_flags |= B_CACHE; 2337 2338 if (newbsize < bp->b_bufsize) { 2339 /* 2340 * DEV_BSIZE aligned new buffer size is less then the 2341 * DEV_BSIZE aligned existing buffer size. Figure out 2342 * if we have to remove any pages. 2343 */ 2344 if (desiredpages < bp->b_npages) { 2345 for (i = desiredpages; i < bp->b_npages; i++) { 2346 /* 2347 * the page is not freed here -- it 2348 * is the responsibility of 2349 * vnode_pager_setsize 2350 */ 2351 m = bp->b_pages[i]; 2352 KASSERT(m != bogus_page, 2353 ("allocbuf: bogus page found")); 2354 while (vm_page_sleep_busy(m, TRUE, "biodep")) 2355 ; 2356 2357 bp->b_pages[i] = NULL; 2358 vm_page_unwire(m, 0); 2359 } 2360 pmap_qremove((vm_offset_t) trunc_page((vm_offset_t)bp->b_data) + 2361 (desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages)); 2362 bp->b_npages = desiredpages; 2363 } 2364 } else if (size > bp->b_bcount) { 2365 /* 2366 * We are growing the buffer, possibly in a 2367 * byte-granular fashion. 2368 */ 2369 struct vnode *vp; 2370 vm_object_t obj; 2371 vm_offset_t toff; 2372 vm_offset_t tinc; 2373 2374 /* 2375 * Step 1, bring in the VM pages from the object, 2376 * allocating them if necessary. We must clear 2377 * B_CACHE if these pages are not valid for the 2378 * range covered by the buffer. 
2379 */ 2380 2381 vp = bp->b_vp; 2382 obj = vp->v_object; 2383 2384 while (bp->b_npages < desiredpages) { 2385 vm_page_t m; 2386 vm_pindex_t pi; 2387 2388 pi = OFF_TO_IDX(bp->b_offset) + bp->b_npages; 2389 if ((m = vm_page_lookup(obj, pi)) == NULL) { 2390 m = vm_page_alloc(obj, pi, VM_ALLOC_NORMAL); 2391 if (m == NULL) { 2392 VM_WAIT; 2393 vm_pageout_deficit += desiredpages - bp->b_npages; 2394 } else { 2395 vm_page_wire(m); 2396 vm_page_wakeup(m); 2397 bp->b_flags &= ~B_CACHE; 2398 bp->b_pages[bp->b_npages] = m; 2399 ++bp->b_npages; 2400 } 2401 continue; 2402 } 2403 2404 /* 2405 * We found a page. If we have to sleep on it, 2406 * retry because it might have gotten freed out 2407 * from under us. 2408 * 2409 * We can only test PG_BUSY here. Blocking on 2410 * m->busy might lead to a deadlock: 2411 * 2412 * vm_fault->getpages->cluster_read->allocbuf 2413 * 2414 */ 2415 2416 if (vm_page_sleep_busy(m, FALSE, "pgtblk")) 2417 continue; 2418 2419 /* 2420 * We have a good page. Should we wakeup the 2421 * page daemon? 2422 */ 2423 if ((curproc != pageproc) && 2424 ((m->queue - m->pc) == PQ_CACHE) && 2425 ((cnt.v_free_count + cnt.v_cache_count) < 2426 (cnt.v_free_min + cnt.v_cache_min))) { 2427 pagedaemon_wakeup(); 2428 } 2429 vm_page_flag_clear(m, PG_ZERO); 2430 vm_page_wire(m); 2431 bp->b_pages[bp->b_npages] = m; 2432 ++bp->b_npages; 2433 } 2434 2435 /* 2436 * Step 2. We've loaded the pages into the buffer, 2437 * we have to figure out if we can still have B_CACHE 2438 * set. Note that B_CACHE is set according to the 2439 * byte-granular range ( bcount and size ), new the 2440 * aligned range ( newbsize ). 2441 * 2442 * The VM test is against m->valid, which is DEV_BSIZE 2443 * aligned. Needless to say, the validity of the data 2444 * needs to also be DEV_BSIZE aligned. Note that this 2445 * fails with NFS if the server or some other client 2446 * extends the file's EOF. If our buffer is resized, 2447 * B_CACHE may remain set! XXX 2448 */ 2449 2450 toff = bp->b_bcount; 2451 tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK); 2452 2453 while ((bp->b_flags & B_CACHE) && toff < size) { 2454 vm_pindex_t pi; 2455 2456 if (tinc > (size - toff)) 2457 tinc = size - toff; 2458 2459 pi = ((bp->b_offset & PAGE_MASK) + toff) >> 2460 PAGE_SHIFT; 2461 2462 vfs_buf_test_cache( 2463 bp, 2464 bp->b_offset, 2465 toff, 2466 tinc, 2467 bp->b_pages[pi] 2468 ); 2469 toff += tinc; 2470 tinc = PAGE_SIZE; 2471 } 2472 2473 /* 2474 * Step 3, fixup the KVM pmap. Remember that 2475 * bp->b_data is relative to bp->b_offset, but 2476 * bp->b_offset may be offset into the first page. 2477 */ 2478 2479 bp->b_data = (caddr_t) 2480 trunc_page((vm_offset_t)bp->b_data); 2481 pmap_qenter( 2482 (vm_offset_t)bp->b_data, 2483 bp->b_pages, 2484 bp->b_npages 2485 ); 2486 bp->b_data = (caddr_t)((vm_offset_t)bp->b_data | 2487 (vm_offset_t)(bp->b_offset & PAGE_MASK)); 2488 } 2489 } 2490 if (bp->b_flags & B_VMIO) 2491 vmiospace += (newbsize - bp->b_bufsize); 2492 bufspace += (newbsize - bp->b_bufsize); 2493 runningbufspace += (newbsize - bp->b_bufsize); 2494 if (newbsize < bp->b_bufsize) 2495 bufspacewakeup(); 2496 bp->b_bufsize = newbsize; /* actual buffer allocation */ 2497 bp->b_bcount = size; /* requested buffer size */ 2498 return 1; 2499 } 2500 2501 /* 2502 * biowait: 2503 * 2504 * Wait for buffer I/O completion, returning error status. The buffer 2505 * is left locked and B_DONE on return. B_EINTR is converted into a EINTR 2506 * error and cleared. 
2507 */ 2508 int 2509 biowait(register struct buf * bp) 2510 { 2511 int s; 2512 2513 s = splbio(); 2514 while ((bp->b_flags & B_DONE) == 0) { 2515 #if defined(NO_SCHEDULE_MODS) 2516 tsleep(bp, PRIBIO, "biowait", 0); 2517 #else 2518 if (bp->b_flags & B_READ) 2519 tsleep(bp, PRIBIO, "biord", 0); 2520 else 2521 tsleep(bp, PRIBIO, "biowr", 0); 2522 #endif 2523 } 2524 splx(s); 2525 if (bp->b_flags & B_EINTR) { 2526 bp->b_flags &= ~B_EINTR; 2527 return (EINTR); 2528 } 2529 if (bp->b_flags & B_ERROR) { 2530 return (bp->b_error ? bp->b_error : EIO); 2531 } else { 2532 return (0); 2533 } 2534 } 2535 2536 /* 2537 * biodone: 2538 * 2539 * Finish I/O on a buffer, optionally calling a completion function. 2540 * This is usually called from an interrupt so process blocking is 2541 * not allowed. 2542 * 2543 * biodone is also responsible for setting B_CACHE in a B_VMIO bp. 2544 * In a non-VMIO bp, B_CACHE will be set on the next getblk() 2545 * assuming B_INVAL is clear. 2546 * 2547 * For the VMIO case, we set B_CACHE if the op was a read and no 2548 * read error occured, or if the op was a write. B_CACHE is never 2549 * set if the buffer is invalid or otherwise uncacheable. 2550 * 2551 * biodone does not mess with B_INVAL, allowing the I/O routine or the 2552 * initiator to leave B_INVAL set to brelse the buffer out of existance 2553 * in the biodone routine. 2554 */ 2555 void 2556 biodone(register struct buf * bp) 2557 { 2558 int s; 2559 2560 s = splbio(); 2561 2562 KASSERT(BUF_REFCNT(bp) > 0, ("biodone: bp %p not busy %d", bp, BUF_REFCNT(bp))); 2563 KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp)); 2564 2565 bp->b_flags |= B_DONE; 2566 2567 if (bp->b_flags & B_FREEBUF) { 2568 brelse(bp); 2569 splx(s); 2570 return; 2571 } 2572 2573 if ((bp->b_flags & B_READ) == 0) { 2574 vwakeup(bp); 2575 } 2576 2577 /* call optional completion function if requested */ 2578 if (bp->b_flags & B_CALL) { 2579 bp->b_flags &= ~B_CALL; 2580 (*bp->b_iodone) (bp); 2581 splx(s); 2582 return; 2583 } 2584 if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_complete) 2585 (*bioops.io_complete)(bp); 2586 2587 if (bp->b_flags & B_VMIO) { 2588 int i, resid; 2589 vm_ooffset_t foff; 2590 vm_page_t m; 2591 vm_object_t obj; 2592 int iosize; 2593 struct vnode *vp = bp->b_vp; 2594 2595 obj = vp->v_object; 2596 2597 #if defined(VFS_BIO_DEBUG) 2598 if (vp->v_usecount == 0) { 2599 panic("biodone: zero vnode ref count"); 2600 } 2601 2602 if (vp->v_object == NULL) { 2603 panic("biodone: missing VM object"); 2604 } 2605 2606 if ((vp->v_flag & VOBJBUF) == 0) { 2607 panic("biodone: vnode is not setup for merged cache"); 2608 } 2609 #endif 2610 2611 foff = bp->b_offset; 2612 KASSERT(bp->b_offset != NOOFFSET, 2613 ("biodone: no buffer offset")); 2614 2615 #if !defined(MAX_PERF) 2616 if (!obj) { 2617 panic("biodone: no object"); 2618 } 2619 #endif 2620 #if defined(VFS_BIO_DEBUG) 2621 if (obj->paging_in_progress < bp->b_npages) { 2622 printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n", 2623 obj->paging_in_progress, bp->b_npages); 2624 } 2625 #endif 2626 2627 /* 2628 * Set B_CACHE if the op was a normal read and no error 2629 * occured. B_CACHE is set for writes in the b*write() 2630 * routines. 
2631 */ 2632 iosize = bp->b_bcount; 2633 if ((bp->b_flags & (B_READ|B_FREEBUF|B_INVAL|B_NOCACHE|B_ERROR)) == B_READ) { 2634 bp->b_flags |= B_CACHE; 2635 } 2636 2637 for (i = 0; i < bp->b_npages; i++) { 2638 int bogusflag = 0; 2639 m = bp->b_pages[i]; 2640 if (m == bogus_page) { 2641 bogusflag = 1; 2642 m = vm_page_lookup(obj, OFF_TO_IDX(foff)); 2643 if (!m) { 2644 #if defined(VFS_BIO_DEBUG) 2645 printf("biodone: page disappeared\n"); 2646 #endif 2647 vm_object_pip_subtract(obj, 1); 2648 bp->b_flags &= ~B_CACHE; 2649 continue; 2650 } 2651 bp->b_pages[i] = m; 2652 pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); 2653 } 2654 #if defined(VFS_BIO_DEBUG) 2655 if (OFF_TO_IDX(foff) != m->pindex) { 2656 printf( 2657 "biodone: foff(%lu)/m->pindex(%d) mismatch\n", 2658 (unsigned long)foff, m->pindex); 2659 } 2660 #endif 2661 resid = IDX_TO_OFF(m->pindex + 1) - foff; 2662 if (resid > iosize) 2663 resid = iosize; 2664 2665 /* 2666 * In the write case, the valid and clean bits are 2667 * already changed correctly ( see bdwrite() ), so we 2668 * only need to do this here in the read case. 2669 */ 2670 if ((bp->b_flags & B_READ) && !bogusflag && resid > 0) { 2671 vfs_page_set_valid(bp, foff, i, m); 2672 } 2673 vm_page_flag_clear(m, PG_ZERO); 2674 2675 /* 2676 * when debugging new filesystems or buffer I/O methods, this 2677 * is the most common error that pops up. if you see this, you 2678 * have not set the page busy flag correctly!!! 2679 */ 2680 if (m->busy == 0) { 2681 #if !defined(MAX_PERF) 2682 printf("biodone: page busy < 0, " 2683 "pindex: %d, foff: 0x(%x,%x), " 2684 "resid: %d, index: %d\n", 2685 (int) m->pindex, (int)(foff >> 32), 2686 (int) foff & 0xffffffff, resid, i); 2687 #endif 2688 if (vp->v_type != VBLK) 2689 #if !defined(MAX_PERF) 2690 printf(" iosize: %ld, lblkno: %d, flags: 0x%lx, npages: %d\n", 2691 bp->b_vp->v_mount->mnt_stat.f_iosize, 2692 (int) bp->b_lblkno, 2693 bp->b_flags, bp->b_npages); 2694 else 2695 printf(" VDEV, lblkno: %d, flags: 0x%lx, npages: %d\n", 2696 (int) bp->b_lblkno, 2697 bp->b_flags, bp->b_npages); 2698 printf(" valid: 0x%x, dirty: 0x%x, wired: %d\n", 2699 m->valid, m->dirty, m->wire_count); 2700 #endif 2701 panic("biodone: page busy < 0\n"); 2702 } 2703 vm_page_io_finish(m); 2704 vm_object_pip_subtract(obj, 1); 2705 foff += resid; 2706 iosize -= resid; 2707 } 2708 if (obj) 2709 vm_object_pip_wakeupn(obj, 0); 2710 } 2711 /* 2712 * For asynchronous completions, release the buffer now. The brelse 2713 * will do a wakeup there if necessary - so no need to do a wakeup 2714 * here in the async case. The sync case always needs to do a wakeup. 2715 */ 2716 2717 if (bp->b_flags & B_ASYNC) { 2718 if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_RELBUF)) != 0) 2719 brelse(bp); 2720 else 2721 bqrelse(bp); 2722 } else { 2723 wakeup(bp); 2724 } 2725 splx(s); 2726 } 2727 2728 /* 2729 * This routine is called in lieu of iodone in the case of 2730 * incomplete I/O. This keeps the busy status for pages 2731 * consistant. 
2732 */ 2733 void 2734 vfs_unbusy_pages(struct buf * bp) 2735 { 2736 int i; 2737 2738 if (bp->b_flags & B_VMIO) { 2739 struct vnode *vp = bp->b_vp; 2740 vm_object_t obj = vp->v_object; 2741 2742 for (i = 0; i < bp->b_npages; i++) { 2743 vm_page_t m = bp->b_pages[i]; 2744 2745 if (m == bogus_page) { 2746 m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i); 2747 #if !defined(MAX_PERF) 2748 if (!m) { 2749 panic("vfs_unbusy_pages: page missing\n"); 2750 } 2751 #endif 2752 bp->b_pages[i] = m; 2753 pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); 2754 } 2755 vm_object_pip_subtract(obj, 1); 2756 vm_page_flag_clear(m, PG_ZERO); 2757 vm_page_io_finish(m); 2758 } 2759 vm_object_pip_wakeupn(obj, 0); 2760 } 2761 } 2762 2763 /* 2764 * vfs_page_set_valid: 2765 * 2766 * Set the valid bits in a page based on the supplied offset. The 2767 * range is restricted to the buffer's size. 2768 * 2769 * This routine is typically called after a read completes. 2770 */ 2771 static void 2772 vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m) 2773 { 2774 vm_ooffset_t soff, eoff; 2775 2776 /* 2777 * Start and end offsets in buffer. eoff - soff may not cross a 2778 * page boundary or cross the end of the buffer. The end of the 2779 * buffer, in this case, is our file EOF, not the allocation size 2780 * of the buffer. 2781 */ 2782 soff = off; 2783 eoff = (off + PAGE_SIZE) & ~PAGE_MASK; 2784 if (eoff > bp->b_offset + bp->b_bcount) 2785 eoff = bp->b_offset + bp->b_bcount; 2786 2787 /* 2788 * Set valid range. This is typically the entire buffer and thus the 2789 * entire page. 2790 */ 2791 if (eoff > soff) { 2792 vm_page_set_validclean( 2793 m, 2794 (vm_offset_t) (soff & PAGE_MASK), 2795 (vm_offset_t) (eoff - soff) 2796 ); 2797 } 2798 } 2799 2800 /* 2801 * This routine is called before a device strategy routine. 2802 * It is used to tell the VM system that paging I/O is in 2803 * progress, and treat the pages associated with the buffer 2804 * almost as being PG_BUSY. Also the object paging_in_progress 2805 * flag is handled to make sure that the object doesn't become 2806 * inconsistent. 2807 * 2808 * Since I/O has not been initiated yet, certain buffer flags 2809 * such as B_ERROR or B_INVAL may be in an inconsistent state 2810 * and should be ignored. 2811 */ 2812 void 2813 vfs_busy_pages(struct buf * bp, int clear_modify) 2814 { 2815 int i, bogus; 2816 2817 if (bp->b_flags & B_VMIO) { 2818 struct vnode *vp = bp->b_vp; 2819 vm_object_t obj = vp->v_object; 2820 vm_ooffset_t foff; 2821 2822 foff = bp->b_offset; 2823 KASSERT(bp->b_offset != NOOFFSET, 2824 ("vfs_busy_pages: no buffer offset")); 2825 vfs_setdirty(bp); 2826 2827 retry: 2828 for (i = 0; i < bp->b_npages; i++) { 2829 vm_page_t m = bp->b_pages[i]; 2830 if (vm_page_sleep_busy(m, FALSE, "vbpage")) 2831 goto retry; 2832 } 2833 2834 bogus = 0; 2835 for (i = 0; i < bp->b_npages; i++) { 2836 vm_page_t m = bp->b_pages[i]; 2837 2838 vm_page_flag_clear(m, PG_ZERO); 2839 if ((bp->b_flags & B_CLUSTER) == 0) { 2840 vm_object_pip_add(obj, 1); 2841 vm_page_io_start(m); 2842 } 2843 2844 /* 2845 * When readying a buffer for a read ( i.e. 2846 * clear_modify == 0 ), it is important to do 2847 * bogus_page replacement for valid pages in 2848 * partially instantiated buffers. Partially 2849 * instantiated buffers can, in turn, occur when 2850 * reconstituting a buffer from its VM backing store 2851 * base. We only have to do this if B_CACHE is 2852 * clear ( which causes the I/O to occur in the 2853 * first place ).
The replacement prevents the read 2854 * I/O from overwriting potentially dirty VM-backed 2855 * pages. XXX bogus page replacement is, uh, bogus. 2856 * It may not work properly with small-block devices. 2857 * We need to find a better way. 2858 */ 2859 2860 vm_page_protect(m, VM_PROT_NONE); 2861 if (clear_modify) 2862 vfs_page_set_valid(bp, foff, i, m); 2863 else if (m->valid == VM_PAGE_BITS_ALL && 2864 (bp->b_flags & B_CACHE) == 0) { 2865 bp->b_pages[i] = bogus_page; 2866 bogus++; 2867 } 2868 foff = (foff + PAGE_SIZE) & ~PAGE_MASK; 2869 } 2870 if (bogus) 2871 pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); 2872 } 2873 } 2874 2875 /* 2876 * Tell the VM system that the pages associated with this buffer 2877 * are clean. This is used for delayed writes where the data is 2878 * going to go to disk eventually without additional VM intevention. 2879 * 2880 * Note that while we only really need to clean through to b_bcount, we 2881 * just go ahead and clean through to b_bufsize. 2882 */ 2883 static void 2884 vfs_clean_pages(struct buf * bp) 2885 { 2886 int i; 2887 2888 if (bp->b_flags & B_VMIO) { 2889 vm_ooffset_t foff; 2890 2891 foff = bp->b_offset; 2892 KASSERT(bp->b_offset != NOOFFSET, 2893 ("vfs_clean_pages: no buffer offset")); 2894 for (i = 0; i < bp->b_npages; i++) { 2895 vm_page_t m = bp->b_pages[i]; 2896 vm_ooffset_t noff = (foff + PAGE_SIZE) & ~PAGE_MASK; 2897 vm_ooffset_t eoff = noff; 2898 2899 if (eoff > bp->b_offset + bp->b_bufsize) 2900 eoff = bp->b_offset + bp->b_bufsize; 2901 vfs_page_set_valid(bp, foff, i, m); 2902 /* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */ 2903 foff = noff; 2904 } 2905 } 2906 } 2907 2908 /* 2909 * vfs_bio_set_validclean: 2910 * 2911 * Set the range within the buffer to valid and clean. The range is 2912 * relative to the beginning of the buffer, b_offset. Note that b_offset 2913 * itself may be offset from the beginning of the first page. 2914 */ 2915 2916 void 2917 vfs_bio_set_validclean(struct buf *bp, int base, int size) 2918 { 2919 if (bp->b_flags & B_VMIO) { 2920 int i; 2921 int n; 2922 2923 /* 2924 * Fixup base to be relative to beginning of first page. 2925 * Set initial n to be the maximum number of bytes in the 2926 * first page that can be validated. 2927 */ 2928 2929 base += (bp->b_offset & PAGE_MASK); 2930 n = PAGE_SIZE - (base & PAGE_MASK); 2931 2932 for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) { 2933 vm_page_t m = bp->b_pages[i]; 2934 2935 if (n > size) 2936 n = size; 2937 2938 vm_page_set_validclean(m, base & PAGE_MASK, n); 2939 base += n; 2940 size -= n; 2941 n = PAGE_SIZE; 2942 } 2943 } 2944 } 2945 2946 /* 2947 * vfs_bio_clrbuf: 2948 * 2949 * clear a buffer. This routine essentially fakes an I/O, so we need 2950 * to clear B_ERROR and B_INVAL. 2951 * 2952 * Note that while we only theoretically need to clear through b_bcount, 2953 * we go ahead and clear through b_bufsize. 
2954 */ 2955 2956 void 2957 vfs_bio_clrbuf(struct buf *bp) { 2958 int i, mask = 0; 2959 caddr_t sa, ea; 2960 if ((bp->b_flags & (B_VMIO | B_MALLOC)) == B_VMIO) { 2961 bp->b_flags &= ~(B_INVAL|B_ERROR); 2962 if( (bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) && 2963 (bp->b_offset & PAGE_MASK) == 0) { 2964 mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1; 2965 if (((bp->b_pages[0]->flags & PG_ZERO) == 0) && 2966 ((bp->b_pages[0]->valid & mask) != mask)) { 2967 bzero(bp->b_data, bp->b_bufsize); 2968 } 2969 bp->b_pages[0]->valid |= mask; 2970 bp->b_resid = 0; 2971 return; 2972 } 2973 ea = sa = bp->b_data; 2974 for(i=0;i<bp->b_npages;i++,sa=ea) { 2975 int j = ((vm_offset_t)sa & PAGE_MASK) / DEV_BSIZE; 2976 ea = (caddr_t)trunc_page((vm_offset_t)sa + PAGE_SIZE); 2977 ea = (caddr_t)(vm_offset_t)ulmin( 2978 (u_long)(vm_offset_t)ea, 2979 (u_long)(vm_offset_t)bp->b_data + bp->b_bufsize); 2980 mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j; 2981 if ((bp->b_pages[i]->valid & mask) == mask) 2982 continue; 2983 if ((bp->b_pages[i]->valid & mask) == 0) { 2984 if ((bp->b_pages[i]->flags & PG_ZERO) == 0) { 2985 bzero(sa, ea - sa); 2986 } 2987 } else { 2988 for (; sa < ea; sa += DEV_BSIZE, j++) { 2989 if (((bp->b_pages[i]->flags & PG_ZERO) == 0) && 2990 (bp->b_pages[i]->valid & (1<<j)) == 0) 2991 bzero(sa, DEV_BSIZE); 2992 } 2993 } 2994 bp->b_pages[i]->valid |= mask; 2995 vm_page_flag_clear(bp->b_pages[i], PG_ZERO); 2996 } 2997 bp->b_resid = 0; 2998 } else { 2999 clrbuf(bp); 3000 } 3001 } 3002 3003 /* 3004 * vm_hold_load_pages and vm_hold_unload pages get pages into 3005 * a buffers address space. The pages are anonymous and are 3006 * not associated with a file object. 3007 */ 3008 void 3009 vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to) 3010 { 3011 vm_offset_t pg; 3012 vm_page_t p; 3013 int index; 3014 3015 to = round_page(to); 3016 from = round_page(from); 3017 index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; 3018 3019 for (pg = from; pg < to; pg += PAGE_SIZE, index++) { 3020 3021 tryagain: 3022 3023 p = vm_page_alloc(kernel_object, 3024 ((pg - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT), 3025 VM_ALLOC_NORMAL); 3026 if (!p) { 3027 vm_pageout_deficit += (to - from) >> PAGE_SHIFT; 3028 VM_WAIT; 3029 goto tryagain; 3030 } 3031 vm_page_wire(p); 3032 p->valid = VM_PAGE_BITS_ALL; 3033 vm_page_flag_clear(p, PG_ZERO); 3034 pmap_kenter(pg, VM_PAGE_TO_PHYS(p)); 3035 bp->b_pages[index] = p; 3036 vm_page_wakeup(p); 3037 } 3038 bp->b_npages = index; 3039 } 3040 3041 void 3042 vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to) 3043 { 3044 vm_offset_t pg; 3045 vm_page_t p; 3046 int index, newnpages; 3047 3048 from = round_page(from); 3049 to = round_page(to); 3050 newnpages = index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; 3051 3052 for (pg = from; pg < to; pg += PAGE_SIZE, index++) { 3053 p = bp->b_pages[index]; 3054 if (p && (index < bp->b_npages)) { 3055 #if !defined(MAX_PERF) 3056 if (p->busy) { 3057 printf("vm_hold_free_pages: blkno: %d, lblkno: %d\n", 3058 bp->b_blkno, bp->b_lblkno); 3059 } 3060 #endif 3061 bp->b_pages[index] = NULL; 3062 pmap_kremove(pg); 3063 vm_page_busy(p); 3064 vm_page_unwire(p, 0); 3065 vm_page_free(p); 3066 } 3067 } 3068 bp->b_npages = newnpages; 3069 } 3070 3071 3072 #include "opt_ddb.h" 3073 #ifdef DDB 3074 #include <ddb/ddb.h> 3075 3076 DB_SHOW_COMMAND(buffer, db_show_buffer) 3077 { 3078 /* get args */ 3079 struct buf *bp = (struct buf *)addr; 3080 3081 if (!have_addr) { 3082 db_printf("usage: show buffer 
<addr>\n"); 3083 return; 3084 } 3085 3086 db_printf("b_flags = 0x%b\n", (u_int)bp->b_flags, PRINT_BUF_FLAGS); 3087 db_printf("b_error = %d, b_bufsize = %ld, b_bcount = %ld, " 3088 "b_resid = %ld\nb_dev = (%d,%d), b_data = %p, " 3089 "b_blkno = %d, b_pblkno = %d\n", 3090 bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid, 3091 major(bp->b_dev), minor(bp->b_dev), 3092 bp->b_data, bp->b_blkno, bp->b_pblkno); 3093 if (bp->b_npages) { 3094 int i; 3095 db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages); 3096 for (i = 0; i < bp->b_npages; i++) { 3097 vm_page_t m; 3098 m = bp->b_pages[i]; 3099 db_printf("(%p, 0x%lx, 0x%lx)", (void *)m->object, 3100 (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m)); 3101 if ((i + 1) < bp->b_npages) 3102 db_printf(","); 3103 } 3104 db_printf("\n"); 3105 } 3106 } 3107 #endif /* DDB */ 3108