/*
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Absolutely no warranty of function or purpose is made by the author
 *    John S. Dyson.
 * 4. This work was done expressly for inclusion into FreeBSD.  Other use
 *    is allowed if this notation is included.
 * 5. Modifications may be freely made to this file if the above conditions
 *    are met.
 *
 * $Id: vfs_bio.c,v 1.127 1997/09/21 04:49:30 dyson Exp $
 */

/*
 * this file contains a new buffer I/O scheme implementing a coherent
 * VM object and buffer cache scheme.  Pains have been taken to make
 * sure that the performance degradation associated with schemes such
 * as this is not realized.
 *
 * Author:  John S. Dyson
 * Significant help during the development and debugging phases
 * has been provided by David Greenman, also of the FreeBSD core team.
 */

#include "opt_bounce.h"

#define VMIO
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/vmmeter.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_prot.h>
#include <vm/vm_kern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_map.h>
#include <sys/buf.h>
#include <sys/mount.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <sys/proc.h>

#include <miscfs/specfs/specdev.h>

static void vfs_update __P((void));
static struct proc *updateproc;
static struct kproc_desc up_kp = {
	"update",
	vfs_update,
	&updateproc
};
SYSINIT_KT(update, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)

struct buf *buf;		/* buffer header pool */
struct swqueue bswlist;

int count_lock_queue __P((void));
static void vm_hold_free_pages(struct buf * bp, vm_offset_t from,
		vm_offset_t to);
static void vm_hold_load_pages(struct buf * bp, vm_offset_t from,
		vm_offset_t to);
static void vfs_buf_set_valid(struct buf *bp, vm_ooffset_t foff,
			      vm_offset_t off, vm_offset_t size,
			      vm_page_t m);
static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off,
			       int pageno, vm_page_t m);
static void vfs_clean_pages(struct buf * bp);
static void vfs_setdirty(struct buf *bp);
static void vfs_vmio_release(struct buf *bp);
static void flushdirtybuffers(int slpflag, int slptimeo);

int needsbuffer;

/*
 * Internal update daemon, process 3
 *	The variable vfs_update_wakeup allows for internal syncs.
 */
int vfs_update_wakeup;


/*
 * buffers base kva
 */

/*
 * bogus page -- for I/O to/from partially complete buffers
 * this is a temporary solution to the problem, but it is not
 * really that bad.
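 * (The bogus page is mapped in place of pages that are already
 * valid so that a partial device read cannot overwrite them.)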
 * it would be better to split the buffer
 * for input in the case of buffers partially already in memory,
 * but the code is intricate enough already.
 */
vm_page_t bogus_page;
static vm_offset_t bogus_offset;

static int bufspace, maxbufspace, vmiospace, maxvmiobufspace,
	bufmallocspace, maxbufmallocspace;
int numdirtybuffers, lodirtybuffers, hidirtybuffers;
static int numfreebuffers, lofreebuffers, hifreebuffers;

SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD,
	&numdirtybuffers, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW,
	&lodirtybuffers, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW,
	&hidirtybuffers, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD,
	&numfreebuffers, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW,
	&lofreebuffers, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW,
	&hifreebuffers, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RW,
	&maxbufspace, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, bufspace, CTLFLAG_RD,
	&bufspace, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, maxvmiobufspace, CTLFLAG_RW,
	&maxvmiobufspace, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, vmiospace, CTLFLAG_RD,
	&vmiospace, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW,
	&maxbufmallocspace, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD,
	&bufmallocspace, 0, "");

static LIST_HEAD(bufhashhdr, buf) bufhashtbl[BUFHSZ], invalhash;
static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES];

extern int vm_swap_size;

#define BUF_MAXUSE 24

#define VFS_BIO_NEED_ANY 1
#define VFS_BIO_NEED_LOWLIMIT 2
#define VFS_BIO_NEED_FREE 4

/*
 * Initialize buffer headers and related structures.
 */
void
bufinit()
{
	struct buf *bp;
	int i;

	TAILQ_INIT(&bswlist);
	LIST_INIT(&invalhash);

	/* first, make a null hash table */
	for (i = 0; i < BUFHSZ; i++)
		LIST_INIT(&bufhashtbl[i]);

	/* next, make a null set of free lists */
	for (i = 0; i < BUFFER_QUEUES; i++)
		TAILQ_INIT(&bufqueues[i]);

	/* finally, initialize each buffer header and stick on empty q */
	for (i = 0; i < nbuf; i++) {
		bp = &buf[i];
		bzero(bp, sizeof *bp);
		bp->b_flags = B_INVAL;	/* we're just an empty header */
		bp->b_dev = NODEV;
		bp->b_rcred = NOCRED;
		bp->b_wcred = NOCRED;
		bp->b_qindex = QUEUE_EMPTY;
		bp->b_vnbufs.le_next = NOLIST;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
	}
	/*
	 * maxbufspace is currently calculated to support all filesystem
	 * blocks being 8K.  If you happen to use a 16K filesystem, the size
	 * of the buffer cache is still the same as it would be for 8K
	 * filesystems.  This keeps the size of the buffer cache "in check"
	 * for big block filesystems.
	 */
	maxbufspace = (nbuf + 8) * DFLTBSIZE;
	/*
	 * reserve 1/3 of the buffers for metadata (VDIR) which might not be
	 * VMIO'ed
	 */
	maxvmiobufspace = 2 * maxbufspace / 3;
	/*
	 * Limit the amount of malloc memory since it is wired permanently
	 * into the kernel space.  Even though this is accounted for in the
	 * buffer allocation, we don't want the malloced region to grow
	 * uncontrolled.  The malloc scheme improves memory utilization
	 * significantly on average (small) directories.
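	 * (Concretely, the cap chosen just below is maxbufspace / 20,
	 * i.e. roughly five percent of the buffer space.)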
	 */
	maxbufmallocspace = maxbufspace / 20;

	/*
	 * Remove the probability of deadlock conditions by limiting the
	 * number of dirty buffers.
	 */
	hidirtybuffers = nbuf / 6 + 20;
	lodirtybuffers = nbuf / 12 + 10;
	numdirtybuffers = 0;
	lofreebuffers = nbuf / 18 + 5;
	hifreebuffers = 2 * lofreebuffers;
	numfreebuffers = nbuf;

	bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE);
	bogus_page = vm_page_alloc(kernel_object,
			((bogus_offset - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
			VM_ALLOC_NORMAL);

}

/*
 * Free the kva allocation for a buffer
 * Must be called only at splbio or higher,
 *  as this is the only locking for buffer_map.
 */
static void
bfreekva(struct buf * bp)
{
	if (bp->b_kvasize == 0)
		return;

	vm_map_delete(buffer_map,
		(vm_offset_t) bp->b_kvabase,
		(vm_offset_t) bp->b_kvabase + bp->b_kvasize);

	bp->b_kvasize = 0;

}

/*
 * remove the buffer from the appropriate free list
 */
void
bremfree(struct buf * bp)
{
	int s = splbio();

	if (bp->b_qindex != QUEUE_NONE) {
		TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
		bp->b_qindex = QUEUE_NONE;
	} else {
#if !defined(MAX_PERF)
		panic("bremfree: removing a buffer when not on a queue");
#endif
	}
	if ((bp->b_flags & B_INVAL) ||
		(bp->b_flags & (B_DELWRI|B_LOCKED)) == 0)
		--numfreebuffers;
	splx(s);
}

/*
 * Get a buffer with the specified data.  Look in the cache first.
 */
int
bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred,
    struct buf ** bpp)
{
	struct buf *bp;

	bp = getblk(vp, blkno, size, 0, 0);
	*bpp = bp;

	/* if not found in cache, do some I/O */
	if ((bp->b_flags & B_CACHE) == 0) {
		if (curproc != NULL)
			curproc->p_stats->p_ru.ru_inblock++;
		bp->b_flags |= B_READ;
		bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
		if (bp->b_rcred == NOCRED) {
			if (cred != NOCRED)
				crhold(cred);
			bp->b_rcred = cred;
		}
		vfs_busy_pages(bp, 0);
		VOP_STRATEGY(bp);
		return (biowait(bp));
	}
	return (0);
}

/*
 * Operates like bread, but also starts asynchronous I/O on
 * read-ahead blocks.
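 *
 * Illustrative caller pattern (a sketch only; "lblkno", "size", "cred"
 * and "bp" stand in for whatever the caller already has, and real
 * filesystems compute the read-ahead list from their own layout policy):
 *
 *	daddr_t rablk = lblkno + 1;
 *	int rasize = size;
 *	error = breadn(vp, lblkno, size, &rablk, &rasize, 1, cred, &bp);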
 */
int
breadn(struct vnode * vp, daddr_t blkno, int size,
    daddr_t * rablkno, int *rabsize,
    int cnt, struct ucred * cred, struct buf ** bpp)
{
	struct buf *bp, *rabp;
	int i;
	int rv = 0, readwait = 0;

	*bpp = bp = getblk(vp, blkno, size, 0, 0);

	/* if not found in cache, do some I/O */
	if ((bp->b_flags & B_CACHE) == 0) {
		if (curproc != NULL)
			curproc->p_stats->p_ru.ru_inblock++;
		bp->b_flags |= B_READ;
		bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
		if (bp->b_rcred == NOCRED) {
			if (cred != NOCRED)
				crhold(cred);
			bp->b_rcred = cred;
		}
		vfs_busy_pages(bp, 0);
		VOP_STRATEGY(bp);
		++readwait;
	}
	for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
		if (inmem(vp, *rablkno))
			continue;
		rabp = getblk(vp, *rablkno, *rabsize, 0, 0);

		if ((rabp->b_flags & B_CACHE) == 0) {
			if (curproc != NULL)
				curproc->p_stats->p_ru.ru_inblock++;
			rabp->b_flags |= B_READ | B_ASYNC;
			rabp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
			if (rabp->b_rcred == NOCRED) {
				if (cred != NOCRED)
					crhold(cred);
				rabp->b_rcred = cred;
			}
			vfs_busy_pages(rabp, 0);
			VOP_STRATEGY(rabp);
		} else {
			brelse(rabp);
		}
	}

	if (readwait) {
		rv = biowait(bp);
	}
	return (rv);
}

/*
 * Write, release buffer on completion.  (Done by iodone
 * if async.)
 */
int
bwrite(struct buf * bp)
{
	int oldflags = bp->b_flags;

	if (bp->b_flags & B_INVAL) {
		brelse(bp);
		return (0);
	}
#if !defined(MAX_PERF)
	if (!(bp->b_flags & B_BUSY))
		panic("bwrite: buffer is not busy???");
#endif

	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
	bp->b_flags |= B_WRITEINPROG;

	if ((oldflags & B_DELWRI) == B_DELWRI) {
		--numdirtybuffers;
		reassignbuf(bp, bp->b_vp);
	}

	bp->b_vp->v_numoutput++;
	vfs_busy_pages(bp, 1);
	if (curproc != NULL)
		curproc->p_stats->p_ru.ru_oublock++;
	VOP_STRATEGY(bp);

	if ((oldflags & B_ASYNC) == 0) {
		int rtval = biowait(bp);

		if (oldflags & B_DELWRI) {
			reassignbuf(bp, bp->b_vp);
		}
		brelse(bp);
		return (rtval);
	}
	return (0);
}

int
vn_bwrite(ap)
	struct vop_bwrite_args *ap;
{
	return (bwrite(ap->a_bp));
}

void
vfs_bio_need_satisfy(void) {
	++numfreebuffers;
	if (!needsbuffer)
		return;
	if (numdirtybuffers < lodirtybuffers) {
		needsbuffer &= ~(VFS_BIO_NEED_ANY | VFS_BIO_NEED_LOWLIMIT);
	} else {
		needsbuffer &= ~VFS_BIO_NEED_ANY;
	}
	if (numfreebuffers >= hifreebuffers) {
		needsbuffer &= ~VFS_BIO_NEED_FREE;
	}
	wakeup(&needsbuffer);
}

/*
 * Delayed write.  (Buffer is marked dirty).
 */
void
bdwrite(struct buf * bp)
{

#if !defined(MAX_PERF)
	if ((bp->b_flags & B_BUSY) == 0) {
		panic("bdwrite: buffer is not busy");
	}
#endif

	if (bp->b_flags & B_INVAL) {
		brelse(bp);
		return;
	}
	if (bp->b_flags & B_TAPE) {
		bawrite(bp);
		return;
	}
	bp->b_flags &= ~(B_READ|B_RELBUF);
	if ((bp->b_flags & B_DELWRI) == 0) {
		bp->b_flags |= B_DONE | B_DELWRI;
		reassignbuf(bp, bp->b_vp);
		++numdirtybuffers;
	}

	/*
	 * This bmap keeps the system from needing to do the bmap later,
	 * perhaps when the system is attempting to do a sync.
	 * Since it is likely that the indirect block -- or whatever other
	 * data structure that the filesystem needs -- is still in memory
	 * now, it is a good thing to do this.  Note also, that if the
	 * pageout daemon is requesting a sync -- there might not be enough
	 * memory to do the bmap then...  So, this is important to do.
	 */
	if (bp->b_lblkno == bp->b_blkno) {
		VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
	}

	/*
	 * Set the *dirty* buffer range based upon the VM system dirty pages.
	 */
	vfs_setdirty(bp);

	/*
	 * We need to do this here to satisfy the vnode_pager and the
	 * pageout daemon, so that it thinks that the pages have been
	 * "cleaned".  Note that since the pages are in a delayed write
	 * buffer -- the VFS layer "will" see that the pages get written
	 * out on the next sync, or perhaps the cluster will be completed.
	 */
	vfs_clean_pages(bp);
	bqrelse(bp);

	if (numdirtybuffers >= hidirtybuffers)
		flushdirtybuffers(0, 0);

	return;
}

/*
 * Asynchronous write.
 * Start output on a buffer, but do not wait for it to complete.
 * The buffer is released when the output completes.
 */
void
bawrite(struct buf * bp)
{
	bp->b_flags |= B_ASYNC;
	(void) VOP_BWRITE(bp);
}

/*
 * Ordered write.
 * Start output on a buffer, but only wait for it to complete if the
 * output device cannot guarantee ordering in some other way.  Devices
 * that can perform asynchronous ordered writes will set the B_ASYNC
 * flag in their strategy routine.
 * The buffer is released when the output completes.
 */
int
bowrite(struct buf * bp)
{
	/*
	 * XXX Add in B_ASYNC once the SCSI
	 *     layer can deal with ordered
	 *     writes properly.
	 */
	bp->b_flags |= B_ORDERED;
	return (VOP_BWRITE(bp));
}

/*
 * Release a buffer.
 */
void
brelse(struct buf * bp)
{
	int s;

	if (bp->b_flags & B_CLUSTER) {
		relpbuf(bp);
		return;
	}
	/* anyone need a "free" block? */
	s = splbio();

	/* anyone need this block? */
	if (bp->b_flags & B_WANTED) {
		bp->b_flags &= ~(B_WANTED | B_AGE);
		wakeup(bp);
	}

	if (bp->b_flags & B_LOCKED)
		bp->b_flags &= ~B_ERROR;

	if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR)) ||
		(bp->b_bufsize <= 0)) {
		bp->b_flags |= B_INVAL;
		if (bp->b_flags & B_DELWRI)
			--numdirtybuffers;
		bp->b_flags &= ~(B_DELWRI | B_CACHE);
		if (((bp->b_flags & B_VMIO) == 0) && bp->b_vp) {
			if (bp->b_bufsize)
				allocbuf(bp, 0);
			brelvp(bp);
		}
	}

	/*
	 * VMIO buffer rundown.  It is not very necessary to keep a VMIO
	 * buffer constituted, so the B_INVAL flag is used to *invalidate*
	 * the buffer, but the VM object is kept around.  The B_NOCACHE flag
	 * is used to invalidate the pages in the VM object.
	 *
	 * If the buffer is a partially filled NFS buffer, keep it
	 * since invalidating it now will lose information.  The valid
	 * flags in the vm_pages have only DEV_BSIZE resolution but
	 * the b_validoff, b_validend fields have byte resolution.
	 * This can avoid unnecessary re-reads of the buffer.
	 * XXX this seems to cause performance problems.
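	 * (Hence the VMIO rundown below is skipped entirely for
	 * delayed-write NFS buffers.)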
	 */
	if ((bp->b_flags & B_VMIO)
	    && !(bp->b_vp->v_tag == VT_NFS &&
		 bp->b_vp->v_type != VBLK &&
		 (bp->b_flags & B_DELWRI) != 0)
#ifdef notdef
	    && (bp->b_vp->v_tag != VT_NFS
		|| bp->b_vp->v_type == VBLK
		|| (bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR))
		|| bp->b_validend == 0
		|| (bp->b_validoff == 0
		    && bp->b_validend == bp->b_bufsize))
#endif
	    ) {
		vm_ooffset_t foff;
		vm_object_t obj;
		int i, resid;
		vm_page_t m;
		struct vnode *vp;
		int iototal = bp->b_bufsize;

		vp = bp->b_vp;

#if !defined(MAX_PERF)
		if (!vp)
			panic("brelse: missing vp");
#endif

		if (bp->b_npages) {
			vm_pindex_t poff;
			obj = (vm_object_t) vp->v_object;
			if (vp->v_type == VBLK)
				foff = ((vm_ooffset_t) bp->b_lblkno) << DEV_BSHIFT;
			else
				foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
			poff = OFF_TO_IDX(foff);
			for (i = 0; i < bp->b_npages; i++) {
				m = bp->b_pages[i];
				if (m == bogus_page) {
					m = vm_page_lookup(obj, poff + i);
#if !defined(MAX_PERF)
					if (!m) {
						panic("brelse: page missing\n");
					}
#endif
					bp->b_pages[i] = m;
					pmap_qenter(trunc_page(bp->b_data),
						bp->b_pages, bp->b_npages);
				}
				resid = IDX_TO_OFF(m->pindex+1) - foff;
				if (resid > iototal)
					resid = iototal;
				if (resid > 0) {
					/*
					 * Don't invalidate the page if the local machine has already
					 * modified it.  This is the lesser of two evils, and should
					 * be fixed.
					 */
					if (bp->b_flags & (B_NOCACHE | B_ERROR)) {
						vm_page_test_dirty(m);
						if (m->dirty == 0) {
							vm_page_set_invalid(m, (vm_offset_t) foff, resid);
							if (m->valid == 0)
								vm_page_protect(m, VM_PROT_NONE);
						}
					}
					if (resid >= PAGE_SIZE) {
						if ((m->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) {
							bp->b_flags |= B_INVAL;
						}
					} else {
						if (!vm_page_is_valid(m,
							(((vm_offset_t) bp->b_data) & PAGE_MASK), resid)) {
							bp->b_flags |= B_INVAL;
						}
					}
				}
				foff += resid;
				iototal -= resid;
			}
		}
		if (bp->b_flags & (B_INVAL | B_RELBUF))
			vfs_vmio_release(bp);
	}
#if !defined(MAX_PERF)
	if (bp->b_qindex != QUEUE_NONE)
		panic("brelse: free buffer onto another queue???");
#endif

	/* enqueue */
	/* buffers with no memory */
	if (bp->b_bufsize == 0) {
		bp->b_flags |= B_INVAL;
		bp->b_qindex = QUEUE_EMPTY;
		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
		LIST_REMOVE(bp, b_hash);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		bp->b_dev = NODEV;
		/*
		 * Get rid of the kva allocation *now*
		 */
		bfreekva(bp);

	/* buffers with junk contents */
	} else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE | B_RELBUF)) {
		bp->b_flags |= B_INVAL;
		bp->b_qindex = QUEUE_AGE;
		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_AGE], bp, b_freelist);
		LIST_REMOVE(bp, b_hash);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		bp->b_dev = NODEV;

	/* buffers that are locked */
	} else if (bp->b_flags & B_LOCKED) {
		bp->b_qindex = QUEUE_LOCKED;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);

	/* buffers with stale but valid contents */
	} else if (bp->b_flags & B_AGE) {
		bp->b_qindex = QUEUE_AGE;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_AGE], bp, b_freelist);

	/* buffers with valid and quite potentially reusable contents */
	} else {
		bp->b_qindex = QUEUE_LRU;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
	}

	if ((bp->b_flags & B_INVAL) ||
		(bp->b_flags & (B_LOCKED|B_DELWRI)) == 0) {
		if (bp->b_flags & B_DELWRI) {
			--numdirtybuffers;
			bp->b_flags &= ~B_DELWRI;
		}
		vfs_bio_need_satisfy();
	}

	/* unlock */
	bp->b_flags &= ~(B_ORDERED | B_WANTED | B_BUSY |
				B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
	splx(s);
}

/*
 * Release a buffer.
 */
void
bqrelse(struct buf * bp)
{
	int s;

	s = splbio();

	/* anyone need this block? */
	if (bp->b_flags & B_WANTED) {
		bp->b_flags &= ~(B_WANTED | B_AGE);
		wakeup(bp);
	}

#if !defined(MAX_PERF)
	if (bp->b_qindex != QUEUE_NONE)
		panic("bqrelse: free buffer onto another queue???");
#endif

	if (bp->b_flags & B_LOCKED) {
		bp->b_flags &= ~B_ERROR;
		bp->b_qindex = QUEUE_LOCKED;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
	/* buffers with stale but valid contents */
	} else {
		bp->b_qindex = QUEUE_LRU;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
	}

	if ((bp->b_flags & (B_LOCKED|B_DELWRI)) == 0) {
		vfs_bio_need_satisfy();
	}

	/* unlock */
	bp->b_flags &= ~(B_ORDERED | B_WANTED | B_BUSY |
				B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
	splx(s);
}

static void
vfs_vmio_release(bp)
	struct buf *bp;
{
	int i;
	vm_page_t m;

	for (i = 0; i < bp->b_npages; i++) {
		m = bp->b_pages[i];
		bp->b_pages[i] = NULL;
		vm_page_unwire(m);
		/*
		 * We don't mess with busy pages, it is
		 * the responsibility of the process that
		 * busied the pages to deal with them.
		 */
		if ((m->flags & PG_BUSY) || (m->busy != 0))
			continue;

		if (m->wire_count == 0) {

			if (m->flags & PG_WANTED) {
				m->flags &= ~PG_WANTED;
				wakeup(m);
			}

			/*
			 * If this is an async free -- we cannot place
			 * pages onto the cache queue.  If it is an
			 * async free, then we don't modify any queues.
			 * This is probably in error (for perf reasons),
			 * and we will eventually need to build
			 * a more complete infrastructure to support I/O
			 * rundown.
			 */
			if ((bp->b_flags & B_ASYNC) == 0) {

				/*
				 * In the case of sync buffer frees, we can do pretty much
				 * anything to any of the memory queues.  Specifically,
				 * the cache queue is okay to be modified.
				 */
				if (m->valid) {
					if(m->dirty == 0)
						vm_page_test_dirty(m);
					/*
					 * this keeps pressure off of the process memory
					 */
					if (m->dirty == 0 && m->hold_count == 0)
						vm_page_cache(m);
					else
						vm_page_deactivate(m);
				} else if (m->hold_count == 0) {
					vm_page_protect(m, VM_PROT_NONE);
					vm_page_free(m);
				}
			} else {
				/*
				 * If async, then at least we clear the
				 * act_count.
				 */
				m->act_count = 0;
			}
		}
	}
	bufspace -= bp->b_bufsize;
	vmiospace -= bp->b_bufsize;
	pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
	bp->b_npages = 0;
	bp->b_bufsize = 0;
	bp->b_flags &= ~B_VMIO;
	if (bp->b_vp)
		brelvp(bp);
}

/*
 * Check to see if a block is currently memory resident.
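 * (gbincore() assumes the caller has already raised splbio(); the
 * incore() wrapper further below takes care of that itself.)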
 */
struct buf *
gbincore(struct vnode * vp, daddr_t blkno)
{
	struct buf *bp;
	struct bufhashhdr *bh;

	bh = BUFHASH(vp, blkno);
	bp = bh->lh_first;

	/* Search hash chain */
	while (bp != NULL) {
		/* hit */
		if (bp->b_vp == vp && bp->b_lblkno == blkno &&
		    (bp->b_flags & B_INVAL) == 0) {
			break;
		}
		bp = bp->b_hash.le_next;
	}
	return (bp);
}

/*
 * this routine implements clustered async writes for
 * clearing out B_DELWRI buffers...  This is much better
 * than the old way of writing only one buffer at a time.
 */
int
vfs_bio_awrite(struct buf * bp)
{
	int i;
	daddr_t lblkno = bp->b_lblkno;
	struct vnode *vp = bp->b_vp;
	int s;
	int ncl;
	struct buf *bpa;
	int nwritten;

	s = splbio();
	/*
	 * right now we support clustered writing only to regular files
	 */
	if ((vp->v_type == VREG) &&
	    (vp->v_mount != 0) && /* Only on nodes that have the size info */
	    (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
		int size;
		int maxcl;

		size = vp->v_mount->mnt_stat.f_iosize;
		maxcl = MAXPHYS / size;

		for (i = 1; i < maxcl; i++) {
			if ((bpa = gbincore(vp, lblkno + i)) &&
			    ((bpa->b_flags & (B_BUSY | B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
			    (B_DELWRI | B_CLUSTEROK)) &&
			    (bpa->b_bufsize == size)) {
				if ((bpa->b_blkno == bpa->b_lblkno) ||
				    (bpa->b_blkno != bp->b_blkno + ((i * size) >> DEV_BSHIFT)))
					break;
			} else {
				break;
			}
		}
		ncl = i;
		/*
		 * this is a possible cluster write
		 */
		if (ncl != 1) {
			nwritten = cluster_wbuild(vp, size, lblkno, ncl);
			splx(s);
			return nwritten;
		}
	}
	bremfree(bp);
	splx(s);
	/*
	 * default (old) behavior, writing out only one block
	 */
	bp->b_flags |= B_BUSY | B_ASYNC;
	nwritten = bp->b_bufsize;
	(void) VOP_BWRITE(bp);
	return nwritten;
}


/*
 * Find a buffer header which is available for use.
 */
static struct buf *
getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int size, int maxsize)
{
	struct buf *bp;
	int nbyteswritten = 0;
	vm_offset_t addr;
	static int writerecursion = 0;

start:
	if (bufspace >= maxbufspace)
		goto trytofreespace;

	/* can we constitute a new buffer? */
	if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]))) {
#if !defined(MAX_PERF)
		if (bp->b_qindex != QUEUE_EMPTY)
			panic("getnewbuf: inconsistent EMPTY queue, qindex=%d",
			    bp->b_qindex);
#endif
		bp->b_flags |= B_BUSY;
		bremfree(bp);
		goto fillbuf;
	}
trytofreespace:
	/*
	 * We keep the file I/O from hogging metadata I/O
	 * This is desirable because file data is cached in the
	 * VM/Buffer cache even if a buffer is freed.
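	 * (The scan that follows tries the AGE queue before the LRU queue,
	 * so explicitly aged buffers are reclaimed first.)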
	 */
	if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]))) {
#if !defined(MAX_PERF)
		if (bp->b_qindex != QUEUE_AGE)
			panic("getnewbuf: inconsistent AGE queue, qindex=%d",
			    bp->b_qindex);
#endif
	} else if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]))) {
#if !defined(MAX_PERF)
		if (bp->b_qindex != QUEUE_LRU)
			panic("getnewbuf: inconsistent LRU queue, qindex=%d",
			    bp->b_qindex);
#endif
	}
	if (!bp) {
		/* wait for a free buffer of any kind */
		needsbuffer |= VFS_BIO_NEED_ANY;
		do
			tsleep(&needsbuffer, (PRIBIO + 1) | slpflag, "newbuf",
			    slptimeo);
		while (needsbuffer & VFS_BIO_NEED_ANY);
		return (0);
	}

#if defined(DIAGNOSTIC)
	if (bp->b_flags & B_BUSY) {
		panic("getnewbuf: busy buffer on free list\n");
	}
#endif

	/*
	 * We are fairly aggressive about freeing VMIO buffers, but since
	 * the buffering is intact without buffer headers, there is not
	 * much loss.  We gain by maintaining non-VMIOed metadata in buffers.
	 */
	if ((bp->b_qindex == QUEUE_LRU) && (bp->b_usecount > 0)) {
		if ((bp->b_flags & B_VMIO) == 0 ||
			(vmiospace < maxvmiobufspace)) {
			--bp->b_usecount;
			TAILQ_REMOVE(&bufqueues[QUEUE_LRU], bp, b_freelist);
			if (TAILQ_FIRST(&bufqueues[QUEUE_LRU]) != NULL) {
				TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
				goto start;
			}
			TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
		}
	}


	/* if we are a delayed write, convert to an async write */
	if ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) {

		if (writerecursion > 0) {
			bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]);
			while (bp) {
				if ((bp->b_flags & B_DELWRI) == 0)
					break;
				bp = TAILQ_NEXT(bp, b_freelist);
			}
			if (bp == NULL) {
				bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]);
				while (bp) {
					if ((bp->b_flags & B_DELWRI) == 0)
						break;
					bp = TAILQ_NEXT(bp, b_freelist);
				}
			}
			if (bp == NULL)
				panic("getnewbuf: cannot get buffer, infinite recursion failure");
		} else {
			++writerecursion;
			nbyteswritten += vfs_bio_awrite(bp);
			--writerecursion;
			if (!slpflag && !slptimeo) {
				return (0);
			}
			goto start;
		}
	}

	if (bp->b_flags & B_WANTED) {
		bp->b_flags &= ~B_WANTED;
		wakeup(bp);
	}
	bremfree(bp);
	bp->b_flags |= B_BUSY;

	if (bp->b_flags & B_VMIO) {
		bp->b_flags &= ~B_ASYNC;
		vfs_vmio_release(bp);
	}

	if (bp->b_vp)
		brelvp(bp);

fillbuf:
	/* we are not free, nor do we contain interesting data */
	if (bp->b_rcred != NOCRED) {
		crfree(bp->b_rcred);
		bp->b_rcred = NOCRED;
	}
	if (bp->b_wcred != NOCRED) {
		crfree(bp->b_wcred);
		bp->b_wcred = NOCRED;
	}

	LIST_REMOVE(bp, b_hash);
	LIST_INSERT_HEAD(&invalhash, bp, b_hash);
	if (bp->b_bufsize) {
		allocbuf(bp, 0);
	}
	bp->b_flags = B_BUSY;
	bp->b_dev = NODEV;
	bp->b_vp = NULL;
	bp->b_blkno = bp->b_lblkno = 0;
	bp->b_iodone = 0;
	bp->b_error = 0;
	bp->b_resid = 0;
	bp->b_bcount = 0;
	bp->b_npages = 0;
	bp->b_dirtyoff = bp->b_dirtyend = 0;
	bp->b_validoff = bp->b_validend = 0;
	bp->b_usecount = 4;

	maxsize = (maxsize + PAGE_MASK) & ~PAGE_MASK;

	/*
	 * we assume that buffer_map is not at address 0
	 */
	addr = 0;
	if (maxsize != bp->b_kvasize) {
		bfreekva(bp);

		/*
		 * See if we have buffer kva space
		 */
		if (vm_map_findspace(buffer_map,
			vm_map_min(buffer_map), maxsize, &addr)) {
			bp->b_flags |= B_INVAL;
			brelse(bp);
			goto trytofreespace;
		}
	}

	/*
	 * See if we are below our allocated minimum
	 */
	if (bufspace >= (maxbufspace + nbyteswritten)) {
		bp->b_flags |= B_INVAL;
		brelse(bp);
		goto trytofreespace;
	}

	/*
	 * create a map entry for the buffer -- in essence
	 * reserving the kva space.
	 */
	if (addr) {
		vm_map_insert(buffer_map, NULL, 0,
			addr, addr + maxsize,
			VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT);

		bp->b_kvabase = (caddr_t) addr;
		bp->b_kvasize = maxsize;
	}
	bp->b_data = bp->b_kvabase;

	return (bp);
}

static void
waitfreebuffers(int slpflag, int slptimeo) {
	while (numfreebuffers < hifreebuffers) {
		flushdirtybuffers(slpflag, slptimeo);
		if (numfreebuffers < hifreebuffers)
			break;
		needsbuffer |= VFS_BIO_NEED_FREE;
		if (tsleep(&needsbuffer, PRIBIO|slpflag, "biofre", slptimeo))
			break;
	}
}

static void
flushdirtybuffers(int slpflag, int slptimeo) {
	int s;
	static pid_t flushing = 0;

	s = splbio();

	if (flushing) {
		if (flushing == curproc->p_pid) {
			splx(s);
			return;
		}
		while (flushing) {
			if (tsleep(&flushing, PRIBIO|slpflag, "biofls", slptimeo)) {
				splx(s);
				return;
			}
		}
	}
	flushing = curproc->p_pid;

	while (numdirtybuffers > lodirtybuffers) {
		struct buf *bp;
		needsbuffer |= VFS_BIO_NEED_LOWLIMIT;
		bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]);
		if (bp == NULL)
			bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]);

		while (bp && ((bp->b_flags & B_DELWRI) == 0)) {
			bp = TAILQ_NEXT(bp, b_freelist);
		}

		if (bp) {
			splx(s);
			vfs_bio_awrite(bp);
			s = splbio();
			continue;
		}
		break;
	}

	flushing = 0;
	wakeup(&flushing);
	splx(s);
}

/*
 * Check to see if a block is currently memory resident.
 */
struct buf *
incore(struct vnode * vp, daddr_t blkno)
{
	struct buf *bp;

	int s = splbio();
	bp = gbincore(vp, blkno);
	splx(s);
	return (bp);
}

/*
 * Returns true if no I/O is needed to access the
 * associated VM object.  This is like incore except
 * it also hunts around in the VM system for the data.
 */

int
inmem(struct vnode * vp, daddr_t blkno)
{
	vm_object_t obj;
	vm_offset_t toff, tinc;
	vm_page_t m;
	vm_ooffset_t off;

	if (incore(vp, blkno))
		return 1;
	if (vp->v_mount == NULL)
		return 0;
	if ((vp->v_object == NULL) || (vp->v_flag & VVMIO) == 0)
		return 0;

	obj = vp->v_object;
	tinc = PAGE_SIZE;
	if (tinc > vp->v_mount->mnt_stat.f_iosize)
		tinc = vp->v_mount->mnt_stat.f_iosize;
	off = blkno * vp->v_mount->mnt_stat.f_iosize;

	for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {

		m = vm_page_lookup(obj, OFF_TO_IDX(off + toff));
		if (!m)
			return 0;
		if (vm_page_is_valid(m, (vm_offset_t) (toff + off), tinc) == 0)
			return 0;
	}
	return 1;
}

/*
 * now we set the dirty range for the buffer --
 * for NFS -- if the file is mapped and pages have
 * been written to, let it know.
 * We want the entire range of the buffer to be marked dirty if
 * any of the pages have been written to, for consistency with
 * the b_validoff, b_validend set in the nfs write code, and
 * used by the nfs read code.
 */
static void
vfs_setdirty(struct buf *bp) {
	int i;
	vm_object_t object;
	vm_offset_t boffset, offset;
	/*
	 * We qualify the scan for modified pages on whether the
	 * object has been flushed yet.  The OBJ_WRITEABLE flag
	 * is not cleared simply by protecting pages off.
	 */
	if ((bp->b_flags & B_VMIO) &&
		((object = bp->b_pages[0]->object)->flags & (OBJ_WRITEABLE|OBJ_CLEANING))) {
		/*
		 * test the pages to see if they have been modified directly
		 * by users through the VM system.
		 */
		for (i = 0; i < bp->b_npages; i++)
			vm_page_test_dirty(bp->b_pages[i]);

		/*
		 * scan forwards for the first page modified
		 */
		for (i = 0; i < bp->b_npages; i++) {
			if (bp->b_pages[i]->dirty) {
				break;
			}
		}
		boffset = (i << PAGE_SHIFT);
		if (boffset < bp->b_dirtyoff) {
			bp->b_dirtyoff = boffset;
		}

		/*
		 * scan backwards for the last page modified
		 */
		for (i = bp->b_npages - 1; i >= 0; --i) {
			if (bp->b_pages[i]->dirty) {
				break;
			}
		}
		boffset = (i + 1);
		offset = boffset + bp->b_pages[0]->pindex;
		if (offset >= object->size)
			boffset = object->size - bp->b_pages[0]->pindex;
		if (bp->b_dirtyend < (boffset << PAGE_SHIFT))
			bp->b_dirtyend = (boffset << PAGE_SHIFT);
	}
}

/*
 * Get a block given a specified block and offset into a file/device.
 */
struct buf *
getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo)
{
	struct buf *bp;
	int s;
	struct bufhashhdr *bh;
	int maxsize;
	static pid_t flushing = 0;

	if (vp->v_mount) {
		maxsize = vp->v_mount->mnt_stat.f_iosize;
		/*
		 * This happens on mount points.
		 */
		if (maxsize < size)
			maxsize = size;
	} else {
		maxsize = size;
	}

#if !defined(MAX_PERF)
	if (size > MAXBSIZE)
		panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE);
#endif

	s = splbio();
loop:
	if (numfreebuffers < lofreebuffers) {
		waitfreebuffers(slpflag, slptimeo);
	}

	if ((bp = gbincore(vp, blkno))) {
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			if (bp->b_usecount < BUF_MAXUSE)
				++bp->b_usecount;
			if (!tsleep(bp,
				(PRIBIO + 1) | slpflag, "getblk", slptimeo))
				goto loop;

			splx(s);
			return (struct buf *) NULL;
		}
		bp->b_flags |= B_BUSY | B_CACHE;
		bremfree(bp);

		/*
		 * check for size inconsistencies (note that they shouldn't
		 * happen but do when filesystems don't handle the size changes
		 * correctly.)  We are conservative on metadata and don't just
		 * extend the buffer but write and re-constitute it.
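		 * (If the buffer is VMIO-backed and the new size still fits
		 * in its reserved kva, allocbuf() simply resizes it below;
		 * otherwise the buffer is written out with B_NOCACHE and
		 * re-fetched on the next pass through the loop.)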
		 */

		if (bp->b_bcount != size) {
			if ((bp->b_flags & B_VMIO) && (size <= bp->b_kvasize)) {
				allocbuf(bp, size);
			} else {
				bp->b_flags |= B_NOCACHE;
				VOP_BWRITE(bp);
				goto loop;
			}
		}

		if (bp->b_usecount < BUF_MAXUSE)
			++bp->b_usecount;
		splx(s);
		return (bp);
	} else {
		vm_object_t obj;

		if ((bp = getnewbuf(vp, slpflag, slptimeo, size, maxsize)) == 0) {
			if (slpflag || slptimeo) {
				splx(s);
				return NULL;
			}
			goto loop;
		}

		/*
		 * This code is used to make sure that a buffer is not
		 * created while the getnewbuf routine is blocked.
		 * Normally the vnode is locked so this isn't a problem.
		 * VBLK type I/O requests, however, don't lock the vnode.
		 */
		if (!VOP_ISLOCKED(vp) && gbincore(vp, blkno)) {
			bp->b_flags |= B_INVAL;
			brelse(bp);
			goto loop;
		}

		/*
		 * Insert the buffer into the hash, so that it can
		 * be found by incore.
		 */
		bp->b_blkno = bp->b_lblkno = blkno;
		bgetvp(vp, bp);
		LIST_REMOVE(bp, b_hash);
		bh = BUFHASH(vp, blkno);
		LIST_INSERT_HEAD(bh, bp, b_hash);

		if ((obj = vp->v_object) && (vp->v_flag & VVMIO)) {
			bp->b_flags |= (B_VMIO | B_CACHE);
#if defined(VFS_BIO_DEBUG)
			if (vp->v_type != VREG && vp->v_type != VBLK)
				printf("getblk: vmioing file type %d???\n", vp->v_type);
#endif
		} else {
			bp->b_flags &= ~B_VMIO;
		}
		splx(s);

		allocbuf(bp, size);
#ifdef	PC98
		/*
		 * 1024byte/sector support
		 */
#define B_XXX2 0x8000000
		if (vp->v_flag & 0x10000) bp->b_flags |= B_XXX2;
#endif
		return (bp);
	}
}

/*
 * Get an empty, disassociated buffer of given size.
 */
struct buf *
geteblk(int size)
{
	struct buf *bp;
	int s;

	s = splbio();
	while ((bp = getnewbuf(0, 0, 0, size, MAXBSIZE)) == 0);
	splx(s);
	allocbuf(bp, size);
	bp->b_flags |= B_INVAL;
	return (bp);
}


/*
 * This code constitutes the buffer memory from either anonymous system
 * memory (in the case of non-VMIO operations) or from an associated
 * VM object (in the case of VMIO operations).
 *
 * Note that this code is tricky, and has many complications to resolve
 * deadlock or inconsistent data situations.  Tread lightly!!!
 *
 * Modify the length of a buffer's underlying buffer storage without
 * destroying information (unless, of course the buffer is shrinking).
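 *
 * (Two backing strategies are used below: small first-time, non-VMIO
 * allocations of at most PAGE_SIZE/2 may be satisfied with malloc()ed
 * memory while bufmallocspace stays under its limit; everything else
 * is backed by wired pages, via vm_hold_load_pages() for non-VMIO
 * buffers or the vnode's VM object for VMIO buffers.)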
 */
int
allocbuf(struct buf * bp, int size)
{

	int s;
	int newbsize, mbsize;
	int i;

#if !defined(MAX_PERF)
	if (!(bp->b_flags & B_BUSY))
		panic("allocbuf: buffer not busy");

	if (bp->b_kvasize < size)
		panic("allocbuf: buffer too small");
#endif

	if ((bp->b_flags & B_VMIO) == 0) {
		caddr_t origbuf;
		int origbufsize;
		/*
		 * Just get anonymous memory from the kernel
		 */
		mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
#if !defined(NO_B_MALLOC)
		if (bp->b_flags & B_MALLOC)
			newbsize = mbsize;
		else
#endif
			newbsize = round_page(size);

		if (newbsize < bp->b_bufsize) {
#if !defined(NO_B_MALLOC)
			/*
			 * malloced buffers are not shrunk
			 */
			if (bp->b_flags & B_MALLOC) {
				if (newbsize) {
					bp->b_bcount = size;
				} else {
					free(bp->b_data, M_BIOBUF);
					bufspace -= bp->b_bufsize;
					bufmallocspace -= bp->b_bufsize;
					bp->b_data = bp->b_kvabase;
					bp->b_bufsize = 0;
					bp->b_bcount = 0;
					bp->b_flags &= ~B_MALLOC;
				}
				return 1;
			}
#endif
			vm_hold_free_pages(
			    bp,
			    (vm_offset_t) bp->b_data + newbsize,
			    (vm_offset_t) bp->b_data + bp->b_bufsize);
		} else if (newbsize > bp->b_bufsize) {
#if !defined(NO_B_MALLOC)
			/*
			 * We only use malloced memory on the first allocation,
			 * and revert to page-allocated memory when the buffer
			 * grows.
			 */
			if ( (bufmallocspace < maxbufmallocspace) &&
				(bp->b_bufsize == 0) &&
				(mbsize <= PAGE_SIZE/2)) {

				bp->b_data = malloc(mbsize, M_BIOBUF, M_WAITOK);
				bp->b_bufsize = mbsize;
				bp->b_bcount = size;
				bp->b_flags |= B_MALLOC;
				bufspace += mbsize;
				bufmallocspace += mbsize;
				return 1;
			}
#endif
			origbuf = NULL;
			origbufsize = 0;
#if !defined(NO_B_MALLOC)
			/*
			 * If the buffer is growing on its other-than-first
			 * allocation, then we revert to the page-allocation
			 * scheme.
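			 * (The malloced contents are preserved: they are
			 * copied into the new page-backed storage by the
			 * bcopy() just below.)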
			 */
			if (bp->b_flags & B_MALLOC) {
				origbuf = bp->b_data;
				origbufsize = bp->b_bufsize;
				bp->b_data = bp->b_kvabase;
				bufspace -= bp->b_bufsize;
				bufmallocspace -= bp->b_bufsize;
				bp->b_bufsize = 0;
				bp->b_flags &= ~B_MALLOC;
				newbsize = round_page(newbsize);
			}
#endif
			vm_hold_load_pages(
			    bp,
			    (vm_offset_t) bp->b_data + bp->b_bufsize,
			    (vm_offset_t) bp->b_data + newbsize);
#if !defined(NO_B_MALLOC)
			if (origbuf) {
				bcopy(origbuf, bp->b_data, origbufsize);
				free(origbuf, M_BIOBUF);
			}
#endif
		}
	} else {
		vm_page_t m;
		int desiredpages;

		newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
		desiredpages = (round_page(newbsize) >> PAGE_SHIFT);

#if !defined(NO_B_MALLOC)
		if (bp->b_flags & B_MALLOC)
			panic("allocbuf: VMIO buffer can't be malloced");
#endif

		if (newbsize < bp->b_bufsize) {
			if (desiredpages < bp->b_npages) {
				for (i = desiredpages; i < bp->b_npages; i++) {
					/*
					 * the page is not freed here -- it
					 * is the responsibility of vnode_pager_setsize
					 */
					m = bp->b_pages[i];
#if defined(DIAGNOSTIC)
					if (m == bogus_page)
						panic("allocbuf: bogus page found");
#endif
					s = splvm();
					while ((m->flags & PG_BUSY) || (m->busy != 0)) {
						m->flags |= PG_WANTED;
						tsleep(m, PVM, "biodep", 0);
					}
					splx(s);

					bp->b_pages[i] = NULL;
					vm_page_unwire(m);
				}
				pmap_qremove((vm_offset_t) trunc_page(bp->b_data) +
				    (desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages));
				bp->b_npages = desiredpages;
			}
		} else if (newbsize > bp->b_bufsize) {
			vm_object_t obj;
			vm_offset_t tinc, toff;
			vm_ooffset_t off;
			vm_pindex_t objoff;
			int pageindex, curbpnpages;
			struct vnode *vp;
			int bsize;

			vp = bp->b_vp;

			if (vp->v_type == VBLK)
				bsize = DEV_BSIZE;
			else
				bsize = vp->v_mount->mnt_stat.f_iosize;

			if (bp->b_npages < desiredpages) {
				obj = vp->v_object;
				tinc = PAGE_SIZE;
				if (tinc > bsize)
					tinc = bsize;
				off = (vm_ooffset_t) bp->b_lblkno * bsize;
				curbpnpages = bp->b_npages;
	doretry:
				bp->b_flags |= B_CACHE;
				bp->b_validoff = bp->b_validend = 0;
				for (toff = 0; toff < newbsize; toff += tinc) {
					int bytesinpage;

					pageindex = toff >> PAGE_SHIFT;
					objoff = OFF_TO_IDX(off + toff);
					if (pageindex < curbpnpages) {

						m = bp->b_pages[pageindex];
#ifdef VFS_BIO_DIAG
						if (m->pindex != objoff)
							panic("allocbuf: page changed offset??!!!?");
#endif
						bytesinpage = tinc;
						if (tinc > (newbsize - toff))
							bytesinpage = newbsize - toff;
						if (bp->b_flags & B_CACHE)
							vfs_buf_set_valid(bp, off, toff, bytesinpage, m);
						continue;
					}
					m = vm_page_lookup(obj, objoff);
					if (!m) {
						m = vm_page_alloc(obj, objoff, VM_ALLOC_NORMAL);
						if (!m) {
							VM_WAIT;
							goto doretry;
						}
						/*
						 * Normally it is unwise to clear PG_BUSY without
						 * PAGE_WAKEUP -- but it is okay here, as there is
						 * no chance for blocking between here and vm_page_alloc
						 */
						m->flags &= ~PG_BUSY;
						vm_page_wire(m);
						bp->b_flags &= ~B_CACHE;
					} else if (m->flags & PG_BUSY) {
						s = splvm();
						if (m->flags & PG_BUSY) {
							m->flags |= PG_WANTED;
							tsleep(m, PVM, "pgtblk", 0);
						}
						splx(s);
						goto doretry;
					} else {
						if ((curproc != pageproc) &&
						    ((m->queue - m->pc) == PQ_CACHE) &&
						    ((cnt.v_free_count + cnt.v_cache_count) <
							(cnt.v_free_min + cnt.v_cache_min))) {
							pagedaemon_wakeup();
						}
						bytesinpage = tinc;
						if (tinc > (newbsize - toff))
							bytesinpage = newbsize - toff;
						if (bp->b_flags & B_CACHE)
							vfs_buf_set_valid(bp, off, toff, bytesinpage, m);
						vm_page_wire(m);
					}
					bp->b_pages[pageindex] = m;
					curbpnpages = pageindex + 1;
				}
				if (vp->v_tag == VT_NFS &&
				    vp->v_type != VBLK) {
					if (bp->b_dirtyend > 0) {
						bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff);
						bp->b_validend = max(bp->b_validend, bp->b_dirtyend);
					}
					if (bp->b_validend == 0)
						bp->b_flags &= ~B_CACHE;
				}
				bp->b_data = (caddr_t) trunc_page(bp->b_data);
				bp->b_npages = curbpnpages;
				pmap_qenter((vm_offset_t) bp->b_data,
					bp->b_pages, bp->b_npages);
				((vm_offset_t) bp->b_data) |= off & PAGE_MASK;
			}
		}
	}
	if (bp->b_flags & B_VMIO)
		vmiospace += (newbsize - bp->b_bufsize);
	bufspace += (newbsize - bp->b_bufsize);
	bp->b_bufsize = newbsize;
	bp->b_bcount = size;
	return 1;
}

/*
 * Wait for buffer I/O completion, returning error status.
 */
int
biowait(register struct buf * bp)
{
	int s;

	s = splbio();
	while ((bp->b_flags & B_DONE) == 0)
#if defined(NO_SCHEDULE_MODS)
		tsleep(bp, PRIBIO, "biowait", 0);
#else
		tsleep(bp, curproc->p_usrpri, "biowait", 0);
#endif
	splx(s);
	if (bp->b_flags & B_EINTR) {
		bp->b_flags &= ~B_EINTR;
		return (EINTR);
	}
	if (bp->b_flags & B_ERROR) {
		return (bp->b_error ? bp->b_error : EIO);
	} else {
		return (0);
	}
}

/*
 * Finish I/O on a buffer, calling an optional function.
 * This is usually called from interrupt level, so process blocking
 * is not *a good idea*.
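 *
 * (A device strategy routine typically sets b_resid and, on failure,
 * B_ERROR/b_error on the buffer before calling biodone() at interrupt
 * time; if B_CALL is set, the b_iodone callback below takes over
 * completion handling.)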
 */
void
biodone(register struct buf * bp)
{
	int s;

	s = splbio();

#if !defined(MAX_PERF)
	if (!(bp->b_flags & B_BUSY))
		panic("biodone: buffer not busy");
#endif

	if (bp->b_flags & B_DONE) {
		splx(s);
#if !defined(MAX_PERF)
		printf("biodone: buffer already done\n");
#endif
		return;
	}
	bp->b_flags |= B_DONE;

	if ((bp->b_flags & B_READ) == 0) {
		vwakeup(bp);
	}
#ifdef BOUNCE_BUFFERS
	if (bp->b_flags & B_BOUNCE)
		vm_bounce_free(bp);
#endif

	/* call optional completion function if requested */
	if (bp->b_flags & B_CALL) {
		bp->b_flags &= ~B_CALL;
		(*bp->b_iodone) (bp);
		splx(s);
		return;
	}
	if (bp->b_flags & B_VMIO) {
		int i, resid;
		vm_ooffset_t foff;
		vm_page_t m;
		vm_object_t obj;
		int iosize;
		struct vnode *vp = bp->b_vp;

		obj = vp->v_object;

#if defined(VFS_BIO_DEBUG)
		if (vp->v_usecount == 0) {
			panic("biodone: zero vnode ref count");
		}

		if (vp->v_object == NULL) {
			panic("biodone: missing VM object");
		}

		if ((vp->v_flag & VVMIO) == 0) {
			panic("biodone: vnode is not setup for merged cache");
		}
#endif

		if (vp->v_type == VBLK)
			foff = (vm_ooffset_t) DEV_BSIZE * bp->b_lblkno;
		else
			foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
#if !defined(MAX_PERF)
		if (!obj) {
			panic("biodone: no object");
		}
#endif
#if defined(VFS_BIO_DEBUG)
		if (obj->paging_in_progress < bp->b_npages) {
			printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n",
			    obj->paging_in_progress, bp->b_npages);
		}
#endif
		iosize = bp->b_bufsize;
		for (i = 0; i < bp->b_npages; i++) {
			int bogusflag = 0;
			m = bp->b_pages[i];
			if (m == bogus_page) {
				bogusflag = 1;
				m = vm_page_lookup(obj, OFF_TO_IDX(foff));
				if (!m) {
#if defined(VFS_BIO_DEBUG)
					printf("biodone: page disappeared\n");
#endif
					--obj->paging_in_progress;
					continue;
				}
				bp->b_pages[i] = m;
				pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
			}
#if defined(VFS_BIO_DEBUG)
			if (OFF_TO_IDX(foff) != m->pindex) {
				printf("biodone: foff(%d)/m->pindex(%d) mismatch\n", foff, m->pindex);
			}
#endif
			resid = IDX_TO_OFF(m->pindex + 1) - foff;
			if (resid > iosize)
				resid = iosize;
			/*
			 * In the write case, the valid and clean bits are
			 * already changed correctly, so we only need to do this
			 * here in the read case.
			 */
			if ((bp->b_flags & B_READ) && !bogusflag && resid > 0) {
				vfs_page_set_valid(bp, foff, i, m);
			}

			/*
			 * when debugging new filesystems or buffer I/O methods, this
			 * is the most common error that pops up.  if you see this, you
			 * have not set the page busy flag correctly!!!
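			 * (vfs_busy_pages() is the routine that normally
			 * bumps m->busy and obj->paging_in_progress before
			 * the transfer is started.)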
			 */
			if (m->busy == 0) {
#if !defined(MAX_PERF)
				printf("biodone: page busy < 0, "
				    "pindex: %d, foff: 0x(%x,%x), "
				    "resid: %d, index: %d\n",
				    (int) m->pindex, (int)(foff >> 32),
				    (int) foff & 0xffffffff, resid, i);
#endif
				if (vp->v_type != VBLK)
#if !defined(MAX_PERF)
					printf(" iosize: %ld, lblkno: %d, flags: 0x%lx, npages: %d\n",
					    bp->b_vp->v_mount->mnt_stat.f_iosize,
					    (int) bp->b_lblkno,
					    bp->b_flags, bp->b_npages);
				else
					printf(" VDEV, lblkno: %d, flags: 0x%lx, npages: %d\n",
					    (int) bp->b_lblkno,
					    bp->b_flags, bp->b_npages);
				printf(" valid: 0x%x, dirty: 0x%x, wired: %d\n",
				    m->valid, m->dirty, m->wire_count);
#endif
				panic("biodone: page busy < 0\n");
			}
			--m->busy;
			if ((m->busy == 0) && (m->flags & PG_WANTED)) {
				m->flags &= ~PG_WANTED;
				wakeup(m);
			}
			--obj->paging_in_progress;
			foff += resid;
			iosize -= resid;
		}
		if (obj && obj->paging_in_progress == 0 &&
		    (obj->flags & OBJ_PIPWNT)) {
			obj->flags &= ~OBJ_PIPWNT;
			wakeup(obj);
		}
	}
	/*
	 * For asynchronous completions, release the buffer now. The brelse
	 * checks for B_WANTED and will do the wakeup there if necessary - so
	 * no need to do a wakeup here in the async case.
	 */

	if (bp->b_flags & B_ASYNC) {
		if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_RELBUF)) != 0)
			brelse(bp);
		else
			bqrelse(bp);
	} else {
		bp->b_flags &= ~B_WANTED;
		wakeup(bp);
	}
	splx(s);
}

int
count_lock_queue()
{
	int count;
	struct buf *bp;

	count = 0;
	for (bp = TAILQ_FIRST(&bufqueues[QUEUE_LOCKED]);
	    bp != NULL;
	    bp = TAILQ_NEXT(bp, b_freelist))
		count++;
	return (count);
}

int vfs_update_interval = 30;

static void
vfs_update()
{
	while (1) {
		tsleep(&vfs_update_wakeup, PUSER, "update",
		    hz * vfs_update_interval);
		vfs_update_wakeup = 0;
		sync(curproc, NULL, NULL);
	}
}

static int
sysctl_kern_updateinterval SYSCTL_HANDLER_ARGS
{
	int error = sysctl_handle_int(oidp,
		oidp->oid_arg1, oidp->oid_arg2, req);
	if (!error)
		wakeup(&vfs_update_wakeup);
	return error;
}

SYSCTL_PROC(_kern, KERN_UPDATEINTERVAL, update, CTLTYPE_INT|CTLFLAG_RW,
	&vfs_update_interval, 0, sysctl_kern_updateinterval, "I", "");


/*
 * This routine is called in lieu of iodone in the case of
 * incomplete I/O.  This keeps the busy status for pages
 * consistent.
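 * (It reverses the m->busy and obj->paging_in_progress accounting
 * performed by vfs_busy_pages().)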
 */
void
vfs_unbusy_pages(struct buf * bp)
{
	int i;

	if (bp->b_flags & B_VMIO) {
		struct vnode *vp = bp->b_vp;
		vm_object_t obj = vp->v_object;
		vm_ooffset_t foff;

		foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;

		for (i = 0; i < bp->b_npages; i++) {
			vm_page_t m = bp->b_pages[i];

			if (m == bogus_page) {
				m = vm_page_lookup(obj, OFF_TO_IDX(foff) + i);
#if !defined(MAX_PERF)
				if (!m) {
					panic("vfs_unbusy_pages: page missing\n");
				}
#endif
				bp->b_pages[i] = m;
				pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
			}
			--obj->paging_in_progress;
			--m->busy;
			if ((m->busy == 0) && (m->flags & PG_WANTED)) {
				m->flags &= ~PG_WANTED;
				wakeup(m);
			}
		}
		if (obj->paging_in_progress == 0 &&
		    (obj->flags & OBJ_PIPWNT)) {
			obj->flags &= ~OBJ_PIPWNT;
			wakeup(obj);
		}
	}
}

/*
 * Set NFS' b_validoff and b_validend fields from the valid bits
 * of a page.  If the consumer is not NFS, and the page is not
 * valid for the entire range, clear the B_CACHE flag to force
 * the consumer to re-read the page.
 */
static void
vfs_buf_set_valid(struct buf *bp,
		  vm_ooffset_t foff, vm_offset_t off, vm_offset_t size,
		  vm_page_t m)
{
	if (bp->b_vp->v_tag == VT_NFS && bp->b_vp->v_type != VBLK) {
		vm_offset_t svalid, evalid;
		int validbits = m->valid;

		/*
		 * This only bothers with the first valid range in the
		 * page.
		 */
		svalid = off;
		while (validbits && !(validbits & 1)) {
			svalid += DEV_BSIZE;
			validbits >>= 1;
		}
		evalid = svalid;
		while (validbits & 1) {
			evalid += DEV_BSIZE;
			validbits >>= 1;
		}
		/*
		 * Make sure this range is contiguous with the range
		 * built up from previous pages.  If not, then we will
		 * just use the range from the previous pages.
		 */
		if (svalid == bp->b_validend) {
			bp->b_validoff = min(bp->b_validoff, svalid);
			bp->b_validend = max(bp->b_validend, evalid);
		}
	} else if (!vm_page_is_valid(m,
				     (vm_offset_t) ((foff + off) & PAGE_MASK),
				     size)) {
		bp->b_flags &= ~B_CACHE;
	}
}

/*
 * Set the valid bits in a page, taking care of the b_validoff,
 * b_validend fields which NFS uses to optimise small reads.  Off is
 * the offset within the file and pageno is the page index within the buf.
 */
static void
vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m)
{
	struct vnode *vp = bp->b_vp;
	vm_ooffset_t soff, eoff;

	soff = off;
	eoff = off + min(PAGE_SIZE, bp->b_bufsize);
	vm_page_set_invalid(m,
			    (vm_offset_t) (soff & PAGE_MASK),
			    (vm_offset_t) (eoff - soff));
	if (vp->v_tag == VT_NFS && vp->v_type != VBLK) {
		vm_ooffset_t sv, ev;
		off = off - pageno * PAGE_SIZE;
		sv = off + ((bp->b_validoff + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1));
		ev = off + (bp->b_validend & ~(DEV_BSIZE - 1));
		soff = max(sv, soff);
		eoff = min(ev, eoff);
	}
	if (eoff > soff)
		vm_page_set_validclean(m,
				       (vm_offset_t) (soff & PAGE_MASK),
				       (vm_offset_t) (eoff - soff));
}

/*
 * This routine is called before a device strategy routine.
 * It is used to tell the VM system that paging I/O is in
 * progress, and treat the pages associated with the buffer
 * almost as being PG_BUSY.
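 * For a read into a buffer whose pages are already valid but whose
 * buffer contents are not (B_CACHE clear), the valid pages are
 * replaced by bogus_page below so the device transfer cannot
 * clobber them.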
 * Also the object paging_in_progress flag is handled to make sure
 * that the object doesn't become inconsistent.
 */
void
vfs_busy_pages(struct buf * bp, int clear_modify)
{
	int i;

	if (bp->b_flags & B_VMIO) {
		struct vnode *vp = bp->b_vp;
		vm_object_t obj = vp->v_object;
		vm_ooffset_t foff;

		if (vp->v_type == VBLK)
			foff = (vm_ooffset_t) DEV_BSIZE * bp->b_lblkno;
		else
			foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
		vfs_setdirty(bp);
		for (i = 0; i < bp->b_npages; i++, foff += PAGE_SIZE) {
			vm_page_t m = bp->b_pages[i];

			if ((bp->b_flags & B_CLUSTER) == 0) {
				obj->paging_in_progress++;
				m->busy++;
			}
			vm_page_protect(m, VM_PROT_NONE);
			if (clear_modify)
				vfs_page_set_valid(bp, foff, i, m);
			else if (bp->b_bcount >= PAGE_SIZE) {
				if (m->valid && (bp->b_flags & B_CACHE) == 0) {
					bp->b_pages[i] = bogus_page;
					pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
				}
			}
		}
	}
}

/*
 * Tell the VM system that the pages associated with this buffer
 * are clean.  This is used for delayed writes where the data is
 * going to go to disk eventually without additional VM intervention.
 */
void
vfs_clean_pages(struct buf * bp)
{
	int i;

	if (bp->b_flags & B_VMIO) {
		struct vnode *vp = bp->b_vp;
		vm_object_t obj = vp->v_object;
		vm_ooffset_t foff;

		if (vp->v_type == VBLK)
			foff = (vm_ooffset_t) DEV_BSIZE * bp->b_lblkno;
		else
			foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
		for (i = 0; i < bp->b_npages; i++, foff += PAGE_SIZE) {
			vm_page_t m = bp->b_pages[i];

			vfs_page_set_valid(bp, foff, i, m);
		}
	}
}

void
vfs_bio_clrbuf(struct buf *bp) {
	int i;
	if( bp->b_flags & B_VMIO) {
		if( (bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE)) {
			int mask;
			mask = 0;
			for(i=0;i<bp->b_bufsize;i+=DEV_BSIZE)
				mask |= (1 << (i/DEV_BSIZE));
			if( bp->b_pages[0]->valid != mask) {
				bzero(bp->b_data, bp->b_bufsize);
			}
			bp->b_pages[0]->valid = mask;
			bp->b_resid = 0;
			return;
		}
		for(i=0;i<bp->b_npages;i++) {
			if( bp->b_pages[i]->valid == VM_PAGE_BITS_ALL)
				continue;
			if( bp->b_pages[i]->valid == 0) {
				if ((bp->b_pages[i]->flags & PG_ZERO) == 0) {
					bzero(bp->b_data + (i << PAGE_SHIFT), PAGE_SIZE);
				}
			} else {
				int j;
				for(j=0;j<PAGE_SIZE/DEV_BSIZE;j++) {
					if( (bp->b_pages[i]->valid & (1<<j)) == 0)
						bzero(bp->b_data + (i << PAGE_SHIFT) + j * DEV_BSIZE, DEV_BSIZE);
				}
			}
			/* bp->b_pages[i]->valid = VM_PAGE_BITS_ALL; */
		}
		bp->b_resid = 0;
	} else {
		clrbuf(bp);
	}
}

/*
 * vm_hold_load_pages and vm_hold_free_pages get pages into
 * a buffer's address space.  The pages are anonymous and are
 * not associated with a file object.
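 * (allocbuf() uses this pair to grow and shrink the wired backing
 * store of non-VMIO buffers.)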
 */
void
vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to)
{
	vm_offset_t pg;
	vm_page_t p;
	int index;

	to = round_page(to);
	from = round_page(from);
	index = (from - trunc_page(bp->b_data)) >> PAGE_SHIFT;

	for (pg = from; pg < to; pg += PAGE_SIZE, index++) {

tryagain:

		p = vm_page_alloc(kernel_object,
			((pg - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
			VM_ALLOC_NORMAL);
		if (!p) {
			VM_WAIT;
			goto tryagain;
		}
		vm_page_wire(p);
		pmap_kenter(pg, VM_PAGE_TO_PHYS(p));
		bp->b_pages[index] = p;
		PAGE_WAKEUP(p);
	}
	bp->b_npages = index;
}

void
vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to)
{
	vm_offset_t pg;
	vm_page_t p;
	int index, newnpages;

	from = round_page(from);
	to = round_page(to);
	newnpages = index = (from - trunc_page(bp->b_data)) >> PAGE_SHIFT;

	for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
		p = bp->b_pages[index];
		if (p && (index < bp->b_npages)) {
#if !defined(MAX_PERF)
			if (p->busy) {
				printf("vm_hold_free_pages: blkno: %d, lblkno: %d\n",
					bp->b_blkno, bp->b_lblkno);
			}
#endif
			bp->b_pages[index] = NULL;
			pmap_kremove(pg);
			vm_page_unwire(p);
			vm_page_free(p);
		}
	}
	bp->b_npages = newnpages;
}


#include "opt_ddb.h"
#ifdef DDB
#include <ddb/ddb.h>

DB_SHOW_COMMAND(buffer, db_show_buffer)
{
	/* get args */
	struct buf *bp = (struct buf *)addr;

	if (!have_addr) {
		db_printf("usage: show buffer <addr>\n");
		return;
	}

	db_printf("b_proc = %p,\nb_flags = 0x%b\n", (void *)bp->b_proc,
		  bp->b_flags, "\20\40bounce\37cluster\36vmio\35ram\34ordered"
		  "\33paging\32xxx\31writeinprog\30wanted\27relbuf\26tape"
		  "\25read\24raw\23phys\22clusterok\21malloc\20nocache"
		  "\17locked\16inval\15gathered\14error\13eintr\12done\11dirty"
		  "\10delwri\7call\6cache\5busy\4bad\3async\2needcommit\1age");
	db_printf("b_error = %d, b_bufsize = %ld, b_bcount = %ld, "
		  "b_resid = %ld\nb_dev = 0x%x, b_un.b_addr = %p, "
		  "b_blkno = %d, b_pblkno = %d\n",
		  bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid,
		  bp->b_dev, bp->b_un.b_addr, bp->b_blkno, bp->b_pblkno);
	if (bp->b_npages) {
		int i;
		db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages);
		for (i = 0; i < bp->b_npages; i++) {
			vm_page_t m;
			m = bp->b_pages[i];
			db_printf("(0x%x, 0x%x, 0x%x)", m->object, m->pindex,
				  VM_PAGE_TO_PHYS(m));
			if ((i + 1) < bp->b_npages)
				db_printf(",");
		}
		db_printf("\n");
	}
}
#endif /* DDB */