/*
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Absolutely no warranty of function or purpose is made by the author
 *    John S. Dyson.
 * 4. This work was done expressly for inclusion into FreeBSD.  Other use
 *    is allowed if this notation is included.
 * 5. Modifications may be freely made to this file if the above conditions
 *    are met.
 *
 * $FreeBSD$
 */

/*
 * this file contains a new buffer I/O scheme implementing a coherent
 * VM object and buffer cache scheme.  Pains have been taken to make
 * sure that the performance degradation associated with schemes such
 * as this is not realized.
 *
 * Author:  John S. Dyson
 * Significant help during the development and debugging phases
 * has been provided by David Greenman, also of the FreeBSD core team.
 */

#include "opt_bounce.h"

#define VMIO
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/vmmeter.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_prot.h>
#include <vm/vm_kern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/lock.h>
#include <vm/vm_map.h>
#include <sys/buf.h>
#include <sys/mount.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>

#include <miscfs/specfs/specdev.h>

static void vfs_update __P((void));
static struct proc *updateproc;
static struct kproc_desc up_kp = {
	"update",
	vfs_update,
	&updateproc
};
SYSINIT_KT(update, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)

struct buf *buf;		/* buffer header pool */
struct swqueue bswlist;

int count_lock_queue __P((void));
static void vm_hold_free_pages(struct buf * bp, vm_offset_t from,
		vm_offset_t to);
static void vm_hold_load_pages(struct buf * bp, vm_offset_t from,
		vm_offset_t to);
static void vfs_clean_pages(struct buf * bp);
static void vfs_setdirty(struct buf *bp);
static void vfs_vmio_release(struct buf *bp);

int needsbuffer;

/*
 * Internal update daemon, process 3
 *	The variable vfs_update_wakeup allows for internal syncs.
 */
int vfs_update_wakeup;


/*
 * buffers base kva
 */

/*
 * bogus page -- for I/O to/from partially complete buffers
 * this is a temporary solution to the problem, but it is not
 * really that bad.  it would be better to split the buffer
 * for input in the case of buffers partially already in memory,
 * but the code is intricate enough already.
 */
vm_page_t bogus_page;
static vm_offset_t bogus_offset;

static int bufspace, maxbufspace, vmiospace, maxvmiobufspace,
	bufmallocspace, maxbufmallocspace;

static struct bufhashhdr bufhashtbl[BUFHSZ], invalhash;
static struct bqueues bufqueues[BUFFER_QUEUES];

extern int vm_swap_size;

#define BUF_MAXUSE 16

/*
 * Initialize buffer headers and related structures.
 */
void
bufinit()
{
	struct buf *bp;
	int i;

	TAILQ_INIT(&bswlist);
	LIST_INIT(&invalhash);

	/* first, make a null hash table */
	for (i = 0; i < BUFHSZ; i++)
		LIST_INIT(&bufhashtbl[i]);

	/* next, make a null set of free lists */
	for (i = 0; i < BUFFER_QUEUES; i++)
		TAILQ_INIT(&bufqueues[i]);

	/* finally, initialize each buffer header and stick on empty q */
	for (i = 0; i < nbuf; i++) {
		bp = &buf[i];
		bzero(bp, sizeof *bp);
		bp->b_flags = B_INVAL;	/* we're just an empty header */
		bp->b_dev = NODEV;
		bp->b_rcred = NOCRED;
		bp->b_wcred = NOCRED;
		bp->b_qindex = QUEUE_EMPTY;
		bp->b_vnbufs.le_next = NOLIST;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
	}
	/*
	 * maxbufspace is currently calculated to support all filesystem blocks
	 * to be 8K.  If you happen to use a 16K filesystem, the size of the buffer
	 * cache is still the same as it would be for 8K filesystems.  This
	 * keeps the size of the buffer cache "in check" for big block filesystems.
	 */
	maxbufspace = (nbuf + 8) * DFLTBSIZE;
	/*
	 * reserve 1/3 of the buffers for metadata (VDIR) which might not be VMIO'ed
	 */
	maxvmiobufspace = 2 * maxbufspace / 3;
	/*
	 * Limit the amount of malloc memory since it is wired permanently into
	 * the kernel space.  Even though this is accounted for in the buffer
	 * allocation, we don't want the malloced region to grow uncontrolled.
	 * The malloc scheme improves memory utilization significantly on average
	 * (small) directories.
	 */
	maxbufmallocspace = maxbufspace / 20;

	bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE);
	bogus_page = vm_page_alloc(kernel_object,
			((bogus_offset - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
			VM_ALLOC_NORMAL);

}

/*
 * Free the kva allocation for a buffer
 * Must be called only at splbio or higher,
 *  as this is the only locking for buffer_map.
 */
static void
bfreekva(struct buf * bp)
{
	if (bp->b_kvasize == 0)
		return;

	vm_map_delete(buffer_map,
		(vm_offset_t) bp->b_kvabase,
		(vm_offset_t) bp->b_kvabase + bp->b_kvasize);

	bp->b_kvasize = 0;

}

/*
 * remove the buffer from the appropriate free list
 */
void
bremfree(struct buf * bp)
{
	int s = splbio();

	if (bp->b_qindex != QUEUE_NONE) {
		TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
		bp->b_qindex = QUEUE_NONE;
	} else {
		panic("bremfree: removing a buffer when not on a queue");
	}
	splx(s);
}
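
/*
 * Overview of the free queues used above and throughout this file
 * (a descriptive note only; the authoritative behavior is in brelse(),
 * bqrelse() and getnewbuf() below):
 *
 *	QUEUE_NONE	buffer not on any free list (usually B_BUSY)
 *	QUEUE_LOCKED	buffers with B_LOCKED set; not scanned for reuse
 *	QUEUE_LRU	buffers with valid, potentially reusable contents
 *	QUEUE_AGE	buffers with stale or junk contents, reclaimed first
 *	QUEUE_EMPTY	buffer headers with no memory attached
 *
 * bremfree() above takes a buffer off whichever of these queues it is on;
 * brelse()/bqrelse() put it back on the queue matching its state, and
 * getnewbuf() scans EMPTY, then AGE, then LRU when a header is needed.
 */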

/*
 * Get a buffer with the specified data.  Look in the cache first.
 */
int
bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred,
    struct buf ** bpp)
{
	struct buf *bp;

	bp = getblk(vp, blkno, size, 0, 0);
	*bpp = bp;

	/* if not found in cache, do some I/O */
	if ((bp->b_flags & B_CACHE) == 0) {
		if (curproc != NULL)
			curproc->p_stats->p_ru.ru_inblock++;
		bp->b_flags |= B_READ;
		bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
		if (bp->b_rcred == NOCRED) {
			if (cred != NOCRED)
				crhold(cred);
			bp->b_rcred = cred;
		}
		vfs_busy_pages(bp, 0);
		VOP_STRATEGY(bp);
		return (biowait(bp));
	}
	return (0);
}

/*
 * Operates like bread, but also starts asynchronous I/O on
 * read-ahead blocks.
 */
int
breadn(struct vnode * vp, daddr_t blkno, int size,
    daddr_t * rablkno, int *rabsize,
    int cnt, struct ucred * cred, struct buf ** bpp)
{
	struct buf *bp, *rabp;
	int i;
	int rv = 0, readwait = 0;

	*bpp = bp = getblk(vp, blkno, size, 0, 0);

	/* if not found in cache, do some I/O */
	if ((bp->b_flags & B_CACHE) == 0) {
		if (curproc != NULL)
			curproc->p_stats->p_ru.ru_inblock++;
		bp->b_flags |= B_READ;
		bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
		if (bp->b_rcred == NOCRED) {
			if (cred != NOCRED)
				crhold(cred);
			bp->b_rcred = cred;
		}
		vfs_busy_pages(bp, 0);
		VOP_STRATEGY(bp);
		++readwait;
	}
	for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
		if (inmem(vp, *rablkno))
			continue;
		rabp = getblk(vp, *rablkno, *rabsize, 0, 0);

		if ((rabp->b_flags & B_CACHE) == 0) {
			if (curproc != NULL)
				curproc->p_stats->p_ru.ru_inblock++;
			rabp->b_flags |= B_READ | B_ASYNC;
			rabp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
			if (rabp->b_rcred == NOCRED) {
				if (cred != NOCRED)
					crhold(cred);
				rabp->b_rcred = cred;
			}
			vfs_busy_pages(rabp, 0);
			VOP_STRATEGY(rabp);
		} else {
			brelse(rabp);
		}
	}

	if (readwait) {
		rv = biowait(bp);
	}
	return (rv);
}
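
/*
 * Illustrative use of the read path above (a sketch, not an interface
 * exported by this file): a typical caller obtains a buffer with bread(),
 * consumes b_data, and then releases the buffer with brelse() or bqrelse().
 *
 *	struct buf *bp;
 *	int error;
 *
 *	error = bread(vp, lblkno, bsize, NOCRED, &bp);
 *	if (error) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	...use bp->b_data...
 *	brelse(bp);
 */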

/*
 * Write, release buffer on completion.  (Done by iodone
 * if async.)
 */
int
bwrite(struct buf * bp)
{
	int oldflags = bp->b_flags;

	if (bp->b_flags & B_INVAL) {
		brelse(bp);
		return (0);
	}
	if (!(bp->b_flags & B_BUSY))
		panic("bwrite: buffer is not busy???");

	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
	bp->b_flags |= B_WRITEINPROG;

	if ((oldflags & (B_ASYNC|B_DELWRI)) == (B_ASYNC|B_DELWRI)) {
		reassignbuf(bp, bp->b_vp);
	}

	bp->b_vp->v_numoutput++;
	vfs_busy_pages(bp, 1);
	if (curproc != NULL)
		curproc->p_stats->p_ru.ru_oublock++;
	VOP_STRATEGY(bp);

	/*
	 * Handle ordered writes here.
	 * If the write was originally flagged as ordered,
	 * then we check to see if it was converted to async.
	 * If it was converted to async, and is done now, then
	 * we release the buffer.  Otherwise we clear the
	 * ordered flag because it is not needed anymore.
	 *
	 * Note that biodone has been modified so that it does
	 * not release ordered buffers.  This allows us to have
	 * a chance to determine whether or not the driver
	 * has set the async flag in the strategy routine.  Otherwise
	 * if biodone was not modified, then the buffer may have been
	 * reused before we have had a chance to check the flag.
	 */

	if ((oldflags & B_ORDERED) == B_ORDERED) {
		int s;
		s = splbio();
		if (bp->b_flags & B_ASYNC) {
			if ((bp->b_flags & B_DONE)) {
				if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_RELBUF)) != 0)
					brelse(bp);
				else
					bqrelse(bp);
			}
			splx(s);
			return (0);
		} else {
			bp->b_flags &= ~B_ORDERED;
		}
		splx(s);
	}

	if ((oldflags & B_ASYNC) == 0) {
		int rtval = biowait(bp);

		if (oldflags & B_DELWRI) {
			reassignbuf(bp, bp->b_vp);
		}
		brelse(bp);
		return (rtval);
	}
	return (0);
}

int
vn_bwrite(ap)
	struct vop_bwrite_args *ap;
{
	return (bwrite(ap->a_bp));
}

/*
 * Delayed write.  (Buffer is marked dirty).
 */
void
bdwrite(struct buf * bp)
{

	if ((bp->b_flags & B_BUSY) == 0) {
		panic("bdwrite: buffer is not busy");
	}
	if (bp->b_flags & B_INVAL) {
		brelse(bp);
		return;
	}
	if (bp->b_flags & B_TAPE) {
		bawrite(bp);
		return;
	}
	bp->b_flags &= ~(B_READ|B_RELBUF);
	if ((bp->b_flags & B_DELWRI) == 0) {
		bp->b_flags |= B_DONE | B_DELWRI;
		reassignbuf(bp, bp->b_vp);
	}

	/*
	 * This bmap keeps the system from needing to do the bmap later,
	 * perhaps when the system is attempting to do a sync.  Since it
	 * is likely that the indirect block -- or whatever other data structure
	 * that the filesystem needs is still in memory now, it is a good
	 * thing to do this.  Note also, that if the pageout daemon is
	 * requesting a sync -- there might not be enough memory to do
	 * the bmap then...  So, this is important to do.
	 */
	if (bp->b_lblkno == bp->b_blkno) {
		VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
	}

	/*
	 * Set the *dirty* buffer range based upon the VM system dirty pages.
	 */
	vfs_setdirty(bp);

	/*
	 * We need to do this here to satisfy the vnode_pager and the
	 * pageout daemon, so that it thinks that the pages have been
	 * "cleaned".  Note that since the pages are in a delayed write
	 * buffer -- the VFS layer "will" see that the pages get written
	 * out on the next sync, or perhaps the cluster will be completed.
	 */
	vfs_clean_pages(bp);
	bqrelse(bp);
	return;
}

/*
 * Asynchronous write.
 * Start output on a buffer, but do not wait for it to complete.
 * The buffer is released when the output completes.
 */
void
bawrite(struct buf * bp)
{
	bp->b_flags |= B_ASYNC;
	(void) VOP_BWRITE(bp);
}

/*
 * Ordered write.
 * Start output on a buffer, but only wait for it to complete if the
 * output device cannot guarantee ordering in some other way.  Devices
 * that can perform asynchronous ordered writes will set the B_ASYNC
 * flag in their strategy routine.
 * The buffer is released when the output completes.
 */
int
bowrite(struct buf * bp)
{
	bp->b_flags |= B_ORDERED;
	return (VOP_BWRITE(bp));
}
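
/*
 * Summary of the write-side entry points above (descriptive only):
 *
 *	bwrite()	synchronous unless B_ASYNC is already set; waits for
 *			completion via biowait() and releases the buffer.
 *	bawrite()	sets B_ASYNC and issues the write; the buffer is
 *			released by biodone() when the I/O completes.
 *	bdwrite()	marks the buffer B_DELWRI and requeues it; the data
 *			goes to disk later (e.g. from the update daemon).
 *	bowrite()	sets B_ORDERED; the write is synchronous unless the
 *			driver converts it to async in its strategy routine.
 */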

/*
 * Release a buffer.
 */
void
brelse(struct buf * bp)
{
	int s;

	if (bp->b_flags & B_CLUSTER) {
		relpbuf(bp);
		return;
	}
	/* anyone need a "free" block? */
	s = splbio();

	/* anyone need this block? */
	if (bp->b_flags & B_WANTED) {
		bp->b_flags &= ~(B_WANTED | B_AGE);
		wakeup(bp);
	}

	if (bp->b_flags & B_LOCKED)
		bp->b_flags &= ~B_ERROR;

	if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR)) ||
	    (bp->b_bufsize <= 0)) {
		bp->b_flags |= B_INVAL;
		bp->b_flags &= ~(B_DELWRI | B_CACHE);
		if (((bp->b_flags & B_VMIO) == 0) && bp->b_vp) {
			if (bp->b_bufsize)
				allocbuf(bp, 0);
			brelvp(bp);
		}
	}

	/*
	 * VMIO buffer rundown.  It is not very necessary to keep a VMIO buffer
	 * constituted, so the B_INVAL flag is used to *invalidate* the buffer,
	 * but the VM object is kept around.  The B_NOCACHE flag is used to
	 * invalidate the pages in the VM object.
	 */
	if (bp->b_flags & B_VMIO) {
		vm_ooffset_t foff;
		vm_object_t obj;
		int i, resid;
		vm_page_t m;
		struct vnode *vp;
		int iototal = bp->b_bufsize;

		vp = bp->b_vp;
		if (!vp)
			panic("brelse: missing vp");

		if (bp->b_npages) {
			vm_pindex_t poff;
			obj = (vm_object_t) vp->v_object;
			if (vp->v_type == VBLK)
				foff = ((vm_ooffset_t) bp->b_lblkno) << DEV_BSHIFT;
			else
				foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
			poff = OFF_TO_IDX(foff);
			for (i = 0; i < bp->b_npages; i++) {
				m = bp->b_pages[i];
				if (m == bogus_page) {
					m = vm_page_lookup(obj, poff + i);
					if (!m) {
						panic("brelse: page missing\n");
					}
					bp->b_pages[i] = m;
					pmap_qenter(trunc_page(bp->b_data),
						bp->b_pages, bp->b_npages);
				}
				resid = IDX_TO_OFF(m->pindex+1) - foff;
				if (resid > iototal)
					resid = iototal;
				if (resid > 0) {
					/*
					 * Don't invalidate the page if the local machine has already
					 * modified it.  This is the lesser of two evils, and should
					 * be fixed.
					 */
					if (bp->b_flags & (B_NOCACHE | B_ERROR)) {
						vm_page_test_dirty(m);
						if (m->dirty == 0) {
							vm_page_set_invalid(m, (vm_offset_t) foff, resid);
							if (m->valid == 0)
								vm_page_protect(m, VM_PROT_NONE);
						}
					}
					if (resid >= PAGE_SIZE) {
						if ((m->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) {
							bp->b_flags |= B_INVAL;
						}
					} else {
						if (!vm_page_is_valid(m,
							(((vm_offset_t) bp->b_data) & PAGE_MASK), resid)) {
							bp->b_flags |= B_INVAL;
						}
					}
				}
				foff += resid;
				iototal -= resid;
			}
		}
		if (bp->b_flags & (B_INVAL | B_RELBUF))
			vfs_vmio_release(bp);
	}
	if (bp->b_qindex != QUEUE_NONE)
		panic("brelse: free buffer onto another queue???");

	/* enqueue */
	/* buffers with no memory */
	if (bp->b_bufsize == 0) {
		bp->b_qindex = QUEUE_EMPTY;
		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
		LIST_REMOVE(bp, b_hash);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		bp->b_dev = NODEV;
		/*
		 * Get rid of the kva allocation *now*
		 */
		bfreekva(bp);
		if (needsbuffer) {
			wakeup(&needsbuffer);
			needsbuffer=0;
		}
	/* buffers with junk contents */
	} else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE | B_RELBUF)) {
		bp->b_qindex = QUEUE_AGE;
		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_AGE], bp, b_freelist);
		LIST_REMOVE(bp, b_hash);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		bp->b_dev = NODEV;
		if (needsbuffer) {
			wakeup(&needsbuffer);
			needsbuffer=0;
		}
	/* buffers that are locked */
	} else if (bp->b_flags & B_LOCKED) {
		bp->b_qindex = QUEUE_LOCKED;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
	/* buffers with stale but valid contents */
	} else if (bp->b_flags & B_AGE) {
		bp->b_qindex = QUEUE_AGE;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_AGE], bp, b_freelist);
		if (needsbuffer) {
			wakeup(&needsbuffer);
			needsbuffer=0;
		}
	/* buffers with valid and quite potentially reusable contents */
	} else {
		bp->b_qindex = QUEUE_LRU;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
		if (needsbuffer) {
			wakeup(&needsbuffer);
			needsbuffer=0;
		}
	}

	/* unlock */
	bp->b_flags &= ~(B_ORDERED | B_WANTED | B_BUSY |
				B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
	splx(s);
}

/*
 * Release a buffer.
 */
void
bqrelse(struct buf * bp)
{
	int s;

	s = splbio();


	/* anyone need this block? */
	if (bp->b_flags & B_WANTED) {
		bp->b_flags &= ~(B_WANTED | B_AGE);
		wakeup(bp);
	}

	if (bp->b_qindex != QUEUE_NONE)
		panic("bqrelse: free buffer onto another queue???");

	if (bp->b_flags & B_LOCKED) {
		bp->b_flags &= ~B_ERROR;
		bp->b_qindex = QUEUE_LOCKED;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
	/* buffers with stale but valid contents */
	} else {
		bp->b_qindex = QUEUE_LRU;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
		if (needsbuffer) {
			wakeup(&needsbuffer);
			needsbuffer=0;
		}
	}

	/* unlock */
	bp->b_flags &= ~(B_ORDERED | B_WANTED | B_BUSY |
		B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
	splx(s);
}
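
/*
 * Note on the two release routines above: brelse() performs the full
 * invalidation/VMIO rundown and requeueing logic, while bqrelse() is the
 * cheap path for a buffer whose contents remain valid -- it only requeues
 * onto QUEUE_LOCKED or QUEUE_LRU.  Code in this file uses bqrelse() when
 * the buffer is simply being given back (e.g. bdwrite()), and brelse()
 * when B_NOCACHE, B_INVAL, B_ERROR or B_RELBUF processing may be needed.
 */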

static void
vfs_vmio_release(bp)
	struct buf *bp;
{
	int i;
	vm_page_t m;

	for (i = 0; i < bp->b_npages; i++) {
		m = bp->b_pages[i];
		bp->b_pages[i] = NULL;
		vm_page_unwire(m);
		/*
		 * We don't mess with busy pages, it is
		 * the responsibility of the process that
		 * busied the pages to deal with them.
		 */
		if ((m->flags & PG_BUSY) || (m->busy != 0))
			continue;

		if (m->wire_count == 0) {

			if (m->flags & PG_WANTED) {
				m->flags &= ~PG_WANTED;
				wakeup(m);
			}

			/*
			 * If this is an async free -- we cannot place
			 * pages onto the cache queue, so our policy for
			 * such buffers is to avoid the cache queue, and
			 * only modify the active queue or free queue.
			 */
			if ((bp->b_flags & B_ASYNC) == 0) {

				/*
				 * In the case of sync buffer frees, we can do pretty much
				 * anything to any of the memory queues.  Specifically,
				 * the cache queue is free to be modified.
				 */
				if (m->valid) {
					if (m->dirty == 0)
						vm_page_test_dirty(m);
					/*
					 * this keeps pressure off of the process memory
					 */
					if ((vm_swap_size == 0) ||
						(cnt.v_free_count < cnt.v_free_min)) {
						if ((m->dirty == 0) &&
							(m->hold_count == 0))
							vm_page_cache(m);
						else
							vm_page_deactivate(m);
					}
				} else if (m->hold_count == 0) {
					vm_page_protect(m, VM_PROT_NONE);
					vm_page_free(m);
				}
			} else {
				/*
				 * If async, then at least we clear the
				 * act_count.
				 */
				m->act_count = 0;
			}
		}
	}
	bufspace -= bp->b_bufsize;
	vmiospace -= bp->b_bufsize;
	pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
	bp->b_npages = 0;
	bp->b_bufsize = 0;
	bp->b_flags &= ~B_VMIO;
	if (bp->b_vp)
		brelvp(bp);
}

/*
 * Check to see if a block is currently memory resident.
 */
struct buf *
gbincore(struct vnode * vp, daddr_t blkno)
{
	struct buf *bp;
	struct bufhashhdr *bh;

	bh = BUFHASH(vp, blkno);
	bp = bh->lh_first;

	/* Search hash chain */
	while (bp != NULL) {
		/* hit */
		if (bp->b_vp == vp && bp->b_lblkno == blkno &&
		    (bp->b_flags & B_INVAL) == 0) {
			break;
		}
		bp = bp->b_hash.le_next;
	}
	return (bp);
}

/*
 * this routine implements clustered async writes for
 * clearing out B_DELWRI buffers...  This is much better
 * than the old way of writing only one buffer at a time.
 */
int
vfs_bio_awrite(struct buf * bp)
{
	int i;
	daddr_t lblkno = bp->b_lblkno;
	struct vnode *vp = bp->b_vp;
	int s;
	int ncl;
	struct buf *bpa;
	int nwritten;

	s = splbio();
	/*
	 * right now we support clustered writing only to regular files
	 */
	if ((vp->v_type == VREG) &&
	    (vp->v_mount != 0) && /* Only on nodes that have the size info */
	    (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
		int size;
		int maxcl;

		size = vp->v_mount->mnt_stat.f_iosize;
		maxcl = MAXPHYS / size;

		for (i = 1; i < maxcl; i++) {
			if ((bpa = gbincore(vp, lblkno + i)) &&
			    ((bpa->b_flags & (B_BUSY | B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
			    (B_DELWRI | B_CLUSTEROK)) &&
			    (bpa->b_bufsize == size)) {
				if ((bpa->b_blkno == bpa->b_lblkno) ||
				    (bpa->b_blkno != bp->b_blkno + ((i * size) >> DEV_BSHIFT)))
					break;
			} else {
				break;
			}
		}
		ncl = i;
		/*
		 * this is a possible cluster write
		 */
		if (ncl != 1) {
			nwritten = cluster_wbuild(vp, size, lblkno, ncl);
			splx(s);
			return nwritten;
		}
	}
	bremfree(bp);
	splx(s);
	/*
	 * default (old) behavior, writing out only one block
	 */
	bp->b_flags |= B_BUSY | B_ASYNC;
	nwritten = bp->b_bufsize;
	(void) VOP_BWRITE(bp);
	return nwritten;
}


/*
 * Find a buffer header which is available for use.
 */
static struct buf *
getnewbuf(int slpflag, int slptimeo, int size, int maxsize)
{
	struct buf *bp;
	int nbyteswritten = 0;
	vm_offset_t addr;

start:
	if (bufspace >= maxbufspace)
		goto trytofreespace;

	/* can we constitute a new buffer? */
	if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]))) {
		if (bp->b_qindex != QUEUE_EMPTY)
			panic("getnewbuf: inconsistent EMPTY queue, qindex=%d",
			    bp->b_qindex);
		bp->b_flags |= B_BUSY;
		bremfree(bp);
		goto fillbuf;
	}
trytofreespace:
	/*
	 * We keep the file I/O from hogging metadata I/O
	 * This is desirable because file data is cached in the
	 * VM/Buffer cache even if a buffer is freed.
	 */
	if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]))) {
		if (bp->b_qindex != QUEUE_AGE)
			panic("getnewbuf: inconsistent AGE queue, qindex=%d",
			    bp->b_qindex);
	} else if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]))) {
		if (bp->b_qindex != QUEUE_LRU)
			panic("getnewbuf: inconsistent LRU queue, qindex=%d",
			    bp->b_qindex);
	}
	if (!bp) {
		/* wait for a free buffer of any kind */
		needsbuffer = 1;
		tsleep(&needsbuffer,
		    (PRIBIO + 1) | slpflag, "newbuf", slptimeo);
		return (0);
	}

#if defined(DIAGNOSTIC)
	if (bp->b_flags & B_BUSY) {
		panic("getnewbuf: busy buffer on free list\n");
	}
#endif

	/*
	 * We are fairly aggressive about freeing VMIO buffers, but since
	 * the buffering is intact without buffer headers, there is not
	 * much loss.  We gain by maintaining non-VMIOed metadata in buffers.
	 */
	if ((bp->b_qindex == QUEUE_LRU) && (bp->b_usecount > 0)) {
		if ((bp->b_flags & B_VMIO) == 0 ||
		    (vmiospace < maxvmiobufspace)) {
			--bp->b_usecount;
			TAILQ_REMOVE(&bufqueues[QUEUE_LRU], bp, b_freelist);
			if (TAILQ_FIRST(&bufqueues[QUEUE_LRU]) != NULL) {
				TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
				goto start;
			}
			TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
		}
	}

	/* if we are a delayed write, convert to an async write */
	if ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) {
		nbyteswritten += vfs_bio_awrite(bp);
		if (!slpflag && !slptimeo) {
			return (0);
		}
		goto start;
	}

	if (bp->b_flags & B_WANTED) {
		bp->b_flags &= ~B_WANTED;
		wakeup(bp);
	}
	bremfree(bp);
	bp->b_flags |= B_BUSY;

	if (bp->b_flags & B_VMIO) {
		bp->b_flags &= ~B_ASYNC;
		vfs_vmio_release(bp);
	}

	if (bp->b_vp)
		brelvp(bp);

fillbuf:
	/* we are not free, nor do we contain interesting data */
	if (bp->b_rcred != NOCRED) {
		crfree(bp->b_rcred);
		bp->b_rcred = NOCRED;
	}
	if (bp->b_wcred != NOCRED) {
		crfree(bp->b_wcred);
		bp->b_wcred = NOCRED;
	}

	LIST_REMOVE(bp, b_hash);
	LIST_INSERT_HEAD(&invalhash, bp, b_hash);
	if (bp->b_bufsize) {
		allocbuf(bp, 0);
	}
	bp->b_flags = B_BUSY;
	bp->b_dev = NODEV;
	bp->b_vp = NULL;
	bp->b_blkno = bp->b_lblkno = 0;
	bp->b_iodone = 0;
	bp->b_error = 0;
	bp->b_resid = 0;
	bp->b_bcount = 0;
	bp->b_npages = 0;
	bp->b_dirtyoff = bp->b_dirtyend = 0;
	bp->b_validoff = bp->b_validend = 0;
	bp->b_usecount = 4;

	maxsize = (maxsize + PAGE_MASK) & ~PAGE_MASK;

	/*
	 * we assume that buffer_map is not at address 0
	 */
	addr = 0;
	if (maxsize != bp->b_kvasize) {
		bfreekva(bp);

		/*
		 * See if we have buffer kva space
		 */
		if (vm_map_findspace(buffer_map,
			vm_map_min(buffer_map), maxsize, &addr)) {
			bp->b_flags |= B_INVAL;
			brelse(bp);
			goto trytofreespace;
		}
	}

	/*
	 * See if we are below our allocated minimum
	 */
	if (bufspace >= (maxbufspace + nbyteswritten)) {
		bp->b_flags |= B_INVAL;
		brelse(bp);
		goto trytofreespace;
	}

	/*
	 * create a map entry for the buffer -- in essence
	 * reserving the kva space.
	 */
	if (addr) {
		vm_map_insert(buffer_map, NULL, 0,
			addr, addr + maxsize,
			VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT);

		bp->b_kvabase = (caddr_t) addr;
		bp->b_kvasize = maxsize;
	}
	bp->b_data = bp->b_kvabase;

	return (bp);
}

/*
 * Check to see if a block is currently memory resident.
 */
struct buf *
incore(struct vnode * vp, daddr_t blkno)
{
	struct buf *bp;

	int s = splbio();
	bp = gbincore(vp, blkno);
	splx(s);
	return (bp);
}
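
/*
 * Note: incore() above only answers whether a buffer header for the block
 * exists; inmem() below additionally checks whether the data is resident
 * in the vnode's VM object, which matters for VMIO-backed files where the
 * pages can remain cached after the buffer header has been recycled.
 */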

/*
 * Returns true if no I/O is needed to access the
 * associated VM object.  This is like incore except
 * it also hunts around in the VM system for the data.
 */

int
inmem(struct vnode * vp, daddr_t blkno)
{
	vm_object_t obj;
	vm_offset_t toff, tinc;
	vm_page_t m;
	vm_ooffset_t off;

	if (incore(vp, blkno))
		return 1;
	if (vp->v_mount == NULL)
		return 0;
	if ((vp->v_object == NULL) || (vp->v_flag & VVMIO) == 0)
		return 0;

	obj = vp->v_object;
	tinc = PAGE_SIZE;
	if (tinc > vp->v_mount->mnt_stat.f_iosize)
		tinc = vp->v_mount->mnt_stat.f_iosize;
	off = blkno * vp->v_mount->mnt_stat.f_iosize;

	for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {

		m = vm_page_lookup(obj, OFF_TO_IDX(off + toff));
		if (!m)
			return 0;
		if (vm_page_is_valid(m, (vm_offset_t) (toff + off), tinc) == 0)
			return 0;
	}
	return 1;
}

/*
 * now we set the dirty range for the buffer --
 * for NFS -- if the file is mapped and pages have
 * been written to, let it know.  We want the
 * entire range of the buffer to be marked dirty if
 * any of the pages have been written to for consistency
 * with the b_validoff, b_validend set in the nfs write
 * code, and used by the nfs read code.
 */
static void
vfs_setdirty(struct buf *bp) {
	int i;
	vm_object_t object;
	vm_offset_t boffset, offset;
	/*
	 * We qualify the scan for modified pages on whether the
	 * object has been flushed yet.  The OBJ_WRITEABLE flag
	 * is not cleared simply by protecting pages off.
	 */
	if ((bp->b_flags & B_VMIO) &&
		((object = bp->b_pages[0]->object)->flags & (OBJ_WRITEABLE|OBJ_CLEANING))) {
		/*
		 * test the pages to see if they have been modified directly
		 * by users through the VM system.
		 */
		for (i = 0; i < bp->b_npages; i++)
			vm_page_test_dirty(bp->b_pages[i]);

		/*
		 * scan forwards for the first page modified
		 */
		for (i = 0; i < bp->b_npages; i++) {
			if (bp->b_pages[i]->dirty) {
				break;
			}
		}
		boffset = (i << PAGE_SHIFT);
		if (boffset < bp->b_dirtyoff) {
			bp->b_dirtyoff = boffset;
		}

		/*
		 * scan backwards for the last page modified
		 */
		for (i = bp->b_npages - 1; i >= 0; --i) {
			if (bp->b_pages[i]->dirty) {
				break;
			}
		}
		boffset = (i + 1);
		offset = boffset + bp->b_pages[0]->pindex;
		if (offset >= object->size)
			boffset = object->size - bp->b_pages[0]->pindex;
		if (bp->b_dirtyend < (boffset << PAGE_SHIFT))
			bp->b_dirtyend = (boffset << PAGE_SHIFT);
	}
}
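
/*
 * Illustrative example of the dirty-range computation above (assuming a
 * hypothetical 4-page buffer in which only the page at index 2 was dirtied
 * through a mapping): the forward scan stops at i == 2 and the backward
 * scan also stops at i == 2, so b_dirtyoff becomes at most 2 * PAGE_SIZE
 * and b_dirtyend at least 3 * PAGE_SIZE -- i.e. the dirty range covers the
 * whole of every page that was touched.
 */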

/*
 * Get a block given a specified block and offset into a file/device.
 */
struct buf *
getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo)
{
	struct buf *bp;
	int s;
	struct bufhashhdr *bh;
	int maxsize;

	if (vp->v_mount) {
		maxsize = vp->v_mount->mnt_stat.f_iosize;
		/*
		 * This happens on mount points.
		 */
		if (maxsize < size)
			maxsize = size;
	} else {
		maxsize = size;
	}

	if (size > MAXBSIZE)
		panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE);

	s = splbio();
loop:
	if ((bp = gbincore(vp, blkno))) {
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			if (bp->b_usecount < BUF_MAXUSE)
				++bp->b_usecount;
			if (!tsleep(bp,
				(PRIBIO + 1) | slpflag, "getblk", slptimeo))
				goto loop;

			splx(s);
			return (struct buf *) NULL;
		}
		bp->b_flags |= B_BUSY | B_CACHE;
		bremfree(bp);

		/*
		 * check for size inconsistencies (note that they shouldn't happen
		 * but do when filesystems don't handle the size changes correctly.)
		 * We are conservative on metadata and don't just extend the buffer
		 * but write and re-constitute it.
		 */

		if (bp->b_bcount != size) {
			if ((bp->b_flags & B_VMIO) && (size <= bp->b_kvasize)) {
				allocbuf(bp, size);
			} else {
				bp->b_flags |= B_NOCACHE;
				VOP_BWRITE(bp);
				goto loop;
			}
		}

		if (bp->b_usecount < BUF_MAXUSE)
			++bp->b_usecount;
		splx(s);
		return (bp);
	} else {
		vm_object_t obj;

		if ((bp = getnewbuf(slpflag, slptimeo, size, maxsize)) == 0) {
			if (slpflag || slptimeo) {
				splx(s);
				return NULL;
			}
			goto loop;
		}

		/*
		 * This code is used to make sure that a buffer is not
		 * created while the getnewbuf routine is blocked.
		 * Normally the vnode is locked so this isn't a problem.
		 * VBLK type I/O requests, however, don't lock the vnode.
		 */
		if (!VOP_ISLOCKED(vp) && gbincore(vp, blkno)) {
			bp->b_flags |= B_INVAL;
			brelse(bp);
			goto loop;
		}

		/*
		 * Insert the buffer into the hash, so that it can
		 * be found by incore.
		 */
		bp->b_blkno = bp->b_lblkno = blkno;
		bgetvp(vp, bp);
		LIST_REMOVE(bp, b_hash);
		bh = BUFHASH(vp, blkno);
		LIST_INSERT_HEAD(bh, bp, b_hash);

		if ((obj = vp->v_object) && (vp->v_flag & VVMIO)) {
			bp->b_flags |= (B_VMIO | B_CACHE);
#if defined(VFS_BIO_DEBUG)
			if (vp->v_type != VREG && vp->v_type != VBLK)
				printf("getblk: vmioing file type %d???\n", vp->v_type);
#endif
		} else {
			bp->b_flags &= ~B_VMIO;
		}
		splx(s);

		allocbuf(bp, size);
#ifdef PC98
		/*
		 * 1024byte/sector support
		 */
#define B_XXX2 0x8000000
		if (vp->v_flag & 0x10000) bp->b_flags |= B_XXX2;
#endif
		return (bp);
	}
}

/*
 * Get an empty, disassociated buffer of given size.
 */
struct buf *
geteblk(int size)
{
	struct buf *bp;
	int s;

	s = splbio();
	while ((bp = getnewbuf(0, 0, size, MAXBSIZE)) == 0);
	splx(s);
	allocbuf(bp, size);
	bp->b_flags |= B_INVAL;
	return (bp);
}
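
/*
 * Usage note for the two allocation entry points above (descriptive sketch
 * only): getblk() returns a B_BUSY buffer; if B_CACHE is set the contents
 * are valid, otherwise the caller is expected to fill the buffer (as
 * bread() does) or clear it.  The caller eventually disposes of the buffer
 * with bwrite(), bdwrite(), brelse() or bqrelse().  geteblk() returns an
 * invalid (B_INVAL) buffer with no vnode association, suitable for scratch
 * use and released with brelse().
 */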


/*
 * This code constitutes the buffer memory from either anonymous system
 * memory (in the case of non-VMIO operations) or from an associated
 * VM object (in the case of VMIO operations).
 *
 * Note that this code is tricky, and has many complications to resolve
 * deadlock or inconsistent data situations.  Tread lightly!!!
 *
 * Modify the length of a buffer's underlying buffer storage without
 * destroying information (unless, of course the buffer is shrinking).
 */
int
allocbuf(struct buf * bp, int size)
{

	int s;
	int newbsize, mbsize;
	int i;

	if (!(bp->b_flags & B_BUSY))
		panic("allocbuf: buffer not busy");

	if (bp->b_kvasize < size)
		panic("allocbuf: buffer too small");

	if ((bp->b_flags & B_VMIO) == 0) {
		caddr_t origbuf;
		int origbufsize;
		/*
		 * Just get anonymous memory from the kernel
		 */
		mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
#if !defined(NO_B_MALLOC)
		if (bp->b_flags & B_MALLOC)
			newbsize = mbsize;
		else
#endif
			newbsize = round_page(size);

		if (newbsize < bp->b_bufsize) {
#if !defined(NO_B_MALLOC)
			/*
			 * malloced buffers are not shrunk
			 */
			if (bp->b_flags & B_MALLOC) {
				if (newbsize) {
					bp->b_bcount = size;
				} else {
					free(bp->b_data, M_BIOBUF);
					bufspace -= bp->b_bufsize;
					bufmallocspace -= bp->b_bufsize;
					bp->b_data = bp->b_kvabase;
					bp->b_bufsize = 0;
					bp->b_bcount = 0;
					bp->b_flags &= ~B_MALLOC;
				}
				return 1;
			}
#endif
			vm_hold_free_pages(
			    bp,
			    (vm_offset_t) bp->b_data + newbsize,
			    (vm_offset_t) bp->b_data + bp->b_bufsize);
		} else if (newbsize > bp->b_bufsize) {
#if !defined(NO_B_MALLOC)
			/*
			 * We only use malloced memory on the first allocation,
			 * and revert to page-allocated memory when the buffer grows.
			 */
			if ( (bufmallocspace < maxbufmallocspace) &&
				(bp->b_bufsize == 0) &&
				(mbsize <= PAGE_SIZE/2)) {

				bp->b_data = malloc(mbsize, M_BIOBUF, M_WAITOK);
				bp->b_bufsize = mbsize;
				bp->b_bcount = size;
				bp->b_flags |= B_MALLOC;
				bufspace += mbsize;
				bufmallocspace += mbsize;
				return 1;
			}
#endif
			origbuf = NULL;
			origbufsize = 0;
#if !defined(NO_B_MALLOC)
			/*
			 * If the buffer is growing on its other-than-first allocation,
			 * then we revert to the page-allocation scheme.
			 */
			if (bp->b_flags & B_MALLOC) {
				origbuf = bp->b_data;
				origbufsize = bp->b_bufsize;
				bp->b_data = bp->b_kvabase;
				bufspace -= bp->b_bufsize;
				bufmallocspace -= bp->b_bufsize;
				bp->b_bufsize = 0;
				bp->b_flags &= ~B_MALLOC;
				newbsize = round_page(newbsize);
			}
#endif
			vm_hold_load_pages(
			    bp,
			    (vm_offset_t) bp->b_data + bp->b_bufsize,
			    (vm_offset_t) bp->b_data + newbsize);
#if !defined(NO_B_MALLOC)
			if (origbuf) {
				bcopy(origbuf, bp->b_data, origbufsize);
				free(origbuf, M_BIOBUF);
			}
#endif
		}
	} else {
		vm_page_t m;
		int desiredpages;

		newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
		desiredpages = (round_page(newbsize) >> PAGE_SHIFT);

#if !defined(NO_B_MALLOC)
		if (bp->b_flags & B_MALLOC)
			panic("allocbuf: VMIO buffer can't be malloced");
#endif

		if (newbsize < bp->b_bufsize) {
			if (desiredpages < bp->b_npages) {
				for (i = desiredpages; i < bp->b_npages; i++) {
					/*
					 * the page is not freed here -- it
					 * is the responsibility of vnode_pager_setsize
					 */
					m = bp->b_pages[i];
#if defined(DIAGNOSTIC)
					if (m == bogus_page)
						panic("allocbuf: bogus page found");
#endif
					s = splvm();
					while ((m->flags & PG_BUSY) || (m->busy != 0)) {
						m->flags |= PG_WANTED;
						tsleep(m, PVM, "biodep", 0);
					}
					splx(s);

					bp->b_pages[i] = NULL;
					vm_page_unwire(m);
				}
				pmap_qremove((vm_offset_t) trunc_page(bp->b_data) +
				    (desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages));
				bp->b_npages = desiredpages;
			}
		} else if (newbsize > bp->b_bufsize) {
			vm_object_t obj;
			vm_offset_t tinc, toff;
			vm_ooffset_t off;
			vm_pindex_t objoff;
			int pageindex, curbpnpages;
			struct vnode *vp;
			int bsize;

			vp = bp->b_vp;

			if (vp->v_type == VBLK)
				bsize = DEV_BSIZE;
			else
				bsize = vp->v_mount->mnt_stat.f_iosize;

			if (bp->b_npages < desiredpages) {
				obj = vp->v_object;
				tinc = PAGE_SIZE;
				if (tinc > bsize)
					tinc = bsize;
				off = (vm_ooffset_t) bp->b_lblkno * bsize;
				curbpnpages = bp->b_npages;
		doretry:
				bp->b_flags |= B_CACHE;
				for (toff = 0; toff < newbsize; toff += tinc) {
					int bytesinpage;

					pageindex = toff >> PAGE_SHIFT;
					objoff = OFF_TO_IDX(off + toff);
					if (pageindex < curbpnpages) {

						m = bp->b_pages[pageindex];
#ifdef VFS_BIO_DIAG
						if (m->pindex != objoff)
							panic("allocbuf: page changed offset??!!!?");
#endif
						bytesinpage = tinc;
						if (tinc > (newbsize - toff))
							bytesinpage = newbsize - toff;
						if ((bp->b_flags & B_CACHE) &&
							!vm_page_is_valid(m,
							(vm_offset_t) ((toff + off) & PAGE_MASK),
							bytesinpage)) {
							bp->b_flags &= ~B_CACHE;
						}
						continue;
					}
					m = vm_page_lookup(obj, objoff);
					if (!m) {
						m = vm_page_alloc(obj, objoff, VM_ALLOC_NORMAL);
						if (!m) {
							VM_WAIT;
							goto doretry;
						}
						/*
						 * Normally it is unwise to clear PG_BUSY without
						 * PAGE_WAKEUP -- but it is okay here, as there is
						 * no chance for blocking between here and vm_page_alloc
						 */
						m->flags &= ~PG_BUSY;
						vm_page_wire(m);
						bp->b_flags &= ~B_CACHE;
					} else if (m->flags & PG_BUSY) {
						s = splvm();
						if (m->flags & PG_BUSY) {
							m->flags |= PG_WANTED;
							tsleep(m, PVM, "pgtblk", 0);
						}
						splx(s);
						goto doretry;
					} else {
						if ((curproc != pageproc) &&
							((m->queue - m->pc) == PQ_CACHE) &&
							((cnt.v_free_count + cnt.v_cache_count) <
							(cnt.v_free_min + cnt.v_cache_min))) {
							pagedaemon_wakeup();
						}
						bytesinpage = tinc;
						if (tinc > (newbsize - toff))
							bytesinpage = newbsize - toff;
						if ((bp->b_flags & B_CACHE) &&
							!vm_page_is_valid(m,
							(vm_offset_t) ((toff + off) & PAGE_MASK),
							bytesinpage)) {
							bp->b_flags &= ~B_CACHE;
						}
						vm_page_wire(m);
					}
					bp->b_pages[pageindex] = m;
					curbpnpages = pageindex + 1;
				}
				bp->b_data = (caddr_t) trunc_page(bp->b_data);
				bp->b_npages = curbpnpages;
				pmap_qenter((vm_offset_t) bp->b_data,
					bp->b_pages, bp->b_npages);
				((vm_offset_t) bp->b_data) |= off & PAGE_MASK;
			}
		}
	}
	if (bp->b_flags & B_VMIO)
		vmiospace += bp->b_bufsize;
	bufspace += (newbsize - bp->b_bufsize);
	bp->b_bufsize = newbsize;
	bp->b_bcount = size;
	return 1;
}

/*
 * Wait for buffer I/O completion, returning error status.
 */
int
biowait(register struct buf * bp)
{
	int s;

	s = splbio();
	while ((bp->b_flags & B_DONE) == 0)
		tsleep(bp, PRIBIO, "biowait", 0);
	splx(s);
	if (bp->b_flags & B_EINTR) {
		bp->b_flags &= ~B_EINTR;
		return (EINTR);
	}
	if (bp->b_flags & B_ERROR) {
		return (bp->b_error ? bp->b_error : EIO);
	} else {
		return (0);
	}
}
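
/*
 * Sketch of the async completion convention handled by biodone() below
 * (illustrative only; names other than the b_* fields are hypothetical):
 * a caller that wants a callback instead of sleeping in biowait() sets
 * B_CALL and b_iodone before issuing the I/O, e.g.
 *
 *	bp->b_flags |= B_CALL | B_ASYNC;
 *	bp->b_iodone = my_done_routine;		(hypothetical handler)
 *	VOP_STRATEGY(bp);
 *
 * biodone() then clears B_CALL and invokes the handler; otherwise, for
 * plain B_ASYNC buffers it releases the buffer itself via brelse()/bqrelse().
 */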

/*
 * Finish I/O on a buffer, calling an optional function.
 * This is usually called from interrupt level, so process blocking
 * is not *a good idea*.
 */
void
biodone(register struct buf * bp)
{
	int s;

	s = splbio();
	if (!(bp->b_flags & B_BUSY))
		panic("biodone: buffer not busy");

	if (bp->b_flags & B_DONE) {
		splx(s);
		printf("biodone: buffer already done\n");
		return;
	}
	bp->b_flags |= B_DONE;

	if ((bp->b_flags & B_READ) == 0) {
		vwakeup(bp);
	}
#ifdef BOUNCE_BUFFERS
	if (bp->b_flags & B_BOUNCE)
		vm_bounce_free(bp);
#endif

	/* call optional completion function if requested */
	if (bp->b_flags & B_CALL) {
		bp->b_flags &= ~B_CALL;
		(*bp->b_iodone) (bp);
		splx(s);
		return;
	}
	if (bp->b_flags & B_VMIO) {
		int i, resid;
		vm_ooffset_t foff;
		vm_page_t m;
		vm_object_t obj;
		int iosize;
		struct vnode *vp = bp->b_vp;

		if (vp->v_type == VBLK)
			foff = (vm_ooffset_t) DEV_BSIZE * bp->b_lblkno;
		else
			foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
		obj = vp->v_object;
		if (!obj) {
			panic("biodone: no object");
		}
#if defined(VFS_BIO_DEBUG)
		if (obj->paging_in_progress < bp->b_npages) {
			printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n",
			    obj->paging_in_progress, bp->b_npages);
		}
#endif
		iosize = bp->b_bufsize;
		for (i = 0; i < bp->b_npages; i++) {
			int bogusflag = 0;
			m = bp->b_pages[i];
			if (m == bogus_page) {
				bogusflag = 1;
				m = vm_page_lookup(obj, OFF_TO_IDX(foff));
				if (!m) {
#if defined(VFS_BIO_DEBUG)
					printf("biodone: page disappeared\n");
#endif
					--obj->paging_in_progress;
					continue;
				}
				bp->b_pages[i] = m;
				pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
			}
#if defined(VFS_BIO_DEBUG)
			if (OFF_TO_IDX(foff) != m->pindex) {
				printf("biodone: foff(%d)/m->pindex(%d) mismatch\n", foff, m->pindex);
			}
#endif
			resid = IDX_TO_OFF(m->pindex + 1) - foff;
			if (resid > iosize)
				resid = iosize;
			/*
			 * In the write case, the valid and clean bits are
			 * already changed correctly, so we only need to do this
			 * here in the read case.
			 */
			if ((bp->b_flags & B_READ) && !bogusflag && resid > 0) {
				vm_page_set_validclean(m,
				    (vm_offset_t) (foff & PAGE_MASK), resid);
			}

			/*
			 * when debugging new filesystems or buffer I/O methods, this
			 * is the most common error that pops up.  if you see this, you
			 * have not set the page busy flag correctly!!!
			 */
			if (m->busy == 0) {
				printf("biodone: page busy < 0, "
				    "pindex: %d, foff: 0x(%x,%x), "
				    "resid: %d, index: %d\n",
				    (int) m->pindex, (int)(foff >> 32),
				    (int) foff & 0xffffffff, resid, i);
				if (vp->v_type != VBLK)
					printf(" iosize: %ld, lblkno: %d, flags: 0x%lx, npages: %d\n",
					    bp->b_vp->v_mount->mnt_stat.f_iosize,
					    (int) bp->b_lblkno,
					    bp->b_flags, bp->b_npages);
				else
					printf(" VDEV, lblkno: %d, flags: 0x%lx, npages: %d\n",
					    (int) bp->b_lblkno,
					    bp->b_flags, bp->b_npages);
				printf(" valid: 0x%x, dirty: 0x%x, wired: %d\n",
				    m->valid, m->dirty, m->wire_count);
				panic("biodone: page busy < 0\n");
			}
			--m->busy;
			if ((m->busy == 0) && (m->flags & PG_WANTED)) {
				m->flags &= ~PG_WANTED;
				wakeup(m);
			}
			--obj->paging_in_progress;
			foff += resid;
			iosize -= resid;
		}
		if (obj && obj->paging_in_progress == 0 &&
		    (obj->flags & OBJ_PIPWNT)) {
			obj->flags &= ~OBJ_PIPWNT;
			wakeup(obj);
		}
	}
	/*
	 * For asynchronous completions, release the buffer now. The brelse
	 * checks for B_WANTED and will do the wakeup there if necessary - so
	 * no need to do a wakeup here in the async case.
	 */

	if (bp->b_flags & B_ASYNC) {
		if ((bp->b_flags & B_ORDERED) == 0) {
			if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_RELBUF)) != 0)
				brelse(bp);
			else
				bqrelse(bp);
		}
	} else {
		bp->b_flags &= ~B_WANTED;
		wakeup(bp);
	}
	splx(s);
}

int
count_lock_queue()
{
	int count;
	struct buf *bp;

	count = 0;
	for (bp = TAILQ_FIRST(&bufqueues[QUEUE_LOCKED]);
	    bp != NULL;
	    bp = TAILQ_NEXT(bp, b_freelist))
		count++;
	return (count);
}

int vfs_update_interval = 30;

static void
vfs_update()
{
	while (1) {
		tsleep(&vfs_update_wakeup, PUSER, "update",
		    hz * vfs_update_interval);
		vfs_update_wakeup = 0;
		sync(curproc, NULL, NULL);
	}
}

static int
sysctl_kern_updateinterval SYSCTL_HANDLER_ARGS
{
	int error = sysctl_handle_int(oidp,
		oidp->oid_arg1, oidp->oid_arg2, req);
	if (!error)
		wakeup(&vfs_update_wakeup);
	return error;
}

SYSCTL_PROC(_kern, KERN_UPDATEINTERVAL, update, CTLTYPE_INT|CTLFLAG_RW,
	&vfs_update_interval, 0, sysctl_kern_updateinterval, "I", "");
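
/*
 * Note: the update daemon above sleeps for vfs_update_interval seconds
 * (or until vfs_update_wakeup is signalled) and then calls sync().  The
 * sysctl handler wakes it on a successful write, so changing the interval
 * (presumably via the kern.update sysctl defined above) takes effect
 * without waiting for the current sleep to expire.
 */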


/*
 * This routine is called in lieu of iodone in the case of
 * incomplete I/O.  This keeps the busy status for pages
 * consistent.
 */
void
vfs_unbusy_pages(struct buf * bp)
{
	int i;

	if (bp->b_flags & B_VMIO) {
		struct vnode *vp = bp->b_vp;
		vm_object_t obj = vp->v_object;
		vm_ooffset_t foff;

		foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;

		for (i = 0; i < bp->b_npages; i++) {
			vm_page_t m = bp->b_pages[i];

			if (m == bogus_page) {
				m = vm_page_lookup(obj, OFF_TO_IDX(foff) + i);
				if (!m) {
					panic("vfs_unbusy_pages: page missing\n");
				}
				bp->b_pages[i] = m;
				pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
			}
			--obj->paging_in_progress;
			--m->busy;
			if ((m->busy == 0) && (m->flags & PG_WANTED)) {
				m->flags &= ~PG_WANTED;
				wakeup(m);
			}
		}
		if (obj->paging_in_progress == 0 &&
		    (obj->flags & OBJ_PIPWNT)) {
			obj->flags &= ~OBJ_PIPWNT;
			wakeup(obj);
		}
	}
}

/*
 * This routine is called before a device strategy routine.
 * It is used to tell the VM system that paging I/O is in
 * progress, and treat the pages associated with the buffer
 * almost as being PG_BUSY.  Also the object paging_in_progress
 * flag is handled to make sure that the object doesn't become
 * inconsistent.
 */
void
vfs_busy_pages(struct buf * bp, int clear_modify)
{
	int i;

	if (bp->b_flags & B_VMIO) {
		vm_object_t obj = bp->b_vp->v_object;
		vm_ooffset_t foff;
		int iocount = bp->b_bufsize;

		if (bp->b_vp->v_type == VBLK)
			foff = (vm_ooffset_t) DEV_BSIZE * bp->b_lblkno;
		else
			foff = (vm_ooffset_t) bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
		vfs_setdirty(bp);
		for (i = 0; i < bp->b_npages; i++) {
			vm_page_t m = bp->b_pages[i];
			int resid = IDX_TO_OFF(m->pindex + 1) - foff;

			if (resid > iocount)
				resid = iocount;
			if ((bp->b_flags & B_CLUSTER) == 0) {
				obj->paging_in_progress++;
				m->busy++;
			}
			vm_page_protect(m, VM_PROT_NONE);
			if (clear_modify) {
				vm_page_set_validclean(m,
				    (vm_offset_t) (foff & PAGE_MASK), resid);
			} else if (bp->b_bcount >= PAGE_SIZE) {
				if (m->valid && (bp->b_flags & B_CACHE) == 0) {
					bp->b_pages[i] = bogus_page;
					pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
				}
			}
			foff += resid;
			iocount -= resid;
		}
	}
}
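
/*
 * Illustrative pairing of the two routines above (a sketch of the pattern
 * already used by bread()/bwrite() in this file): vfs_busy_pages() is
 * called just before handing the buffer to the driver, and the pages are
 * un-busied on completion, either by biodone() when the I/O finishes or by
 * vfs_unbusy_pages() in lieu of biodone when the I/O is not completed.
 *
 *	vfs_busy_pages(bp, 0);		0 for reads, 1 for writes
 *	VOP_STRATEGY(bp);
 *	error = biowait(bp);
 */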

/*
 * Tell the VM system that the pages associated with this buffer
 * are clean.  This is used for delayed writes where the data is
 * going to go to disk eventually without additional VM intervention.
 */
void
vfs_clean_pages(struct buf * bp)
{
	int i;

	if (bp->b_flags & B_VMIO) {
		vm_ooffset_t foff;
		int iocount = bp->b_bufsize;

		if (bp->b_vp->v_type == VBLK)
			foff = (vm_ooffset_t) DEV_BSIZE * bp->b_lblkno;
		else
			foff = (vm_ooffset_t) bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;

		for (i = 0; i < bp->b_npages; i++) {
			vm_page_t m = bp->b_pages[i];
			int resid = IDX_TO_OFF(m->pindex + 1) - foff;

			if (resid > iocount)
				resid = iocount;
			if (resid > 0) {
				vm_page_set_validclean(m,
					((vm_offset_t) foff & PAGE_MASK), resid);
			}
			foff += resid;
			iocount -= resid;
		}
	}
}

void
vfs_bio_clrbuf(struct buf *bp) {
	int i;
	if (bp->b_flags & B_VMIO) {
		if ((bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE)) {
			int mask;
			mask = 0;
			for (i = 0; i < bp->b_bufsize; i += DEV_BSIZE)
				mask |= (1 << (i / DEV_BSIZE));
			if (bp->b_pages[0]->valid != mask) {
				bzero(bp->b_data, bp->b_bufsize);
			}
			bp->b_pages[0]->valid = mask;
			bp->b_resid = 0;
			return;
		}
		for (i = 0; i < bp->b_npages; i++) {
			if (bp->b_pages[i]->valid == VM_PAGE_BITS_ALL)
				continue;
			if (bp->b_pages[i]->valid == 0) {
				if ((bp->b_pages[i]->flags & PG_ZERO) == 0) {
					bzero(bp->b_data + (i << PAGE_SHIFT), PAGE_SIZE);
				}
			} else {
				int j;
				for (j = 0; j < PAGE_SIZE / DEV_BSIZE; j++) {
					if ((bp->b_pages[i]->valid & (1 << j)) == 0)
						bzero(bp->b_data + (i << PAGE_SHIFT) + j * DEV_BSIZE, DEV_BSIZE);
				}
			}
			/* bp->b_pages[i]->valid = VM_PAGE_BITS_ALL; */
		}
		bp->b_resid = 0;
	} else {
		clrbuf(bp);
	}
}

/*
 * vm_hold_load_pages and vm_hold_free_pages get pages into
 * a buffer's address space.  The pages are anonymous and are
 * not associated with a file object.
 */
void
vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to)
{
	vm_offset_t pg;
	vm_page_t p;
	int index;

	to = round_page(to);
	from = round_page(from);
	index = (from - trunc_page(bp->b_data)) >> PAGE_SHIFT;

	for (pg = from; pg < to; pg += PAGE_SIZE, index++) {

tryagain:

		p = vm_page_alloc(kernel_object, ((pg - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
		    VM_ALLOC_NORMAL);
		if (!p) {
			VM_WAIT;
			goto tryagain;
		}
		vm_page_wire(p);
		pmap_kenter(pg, VM_PAGE_TO_PHYS(p));
		bp->b_pages[index] = p;
		PAGE_WAKEUP(p);
	}
	bp->b_npages = to >> PAGE_SHIFT;
}

void
vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to)
{
	vm_offset_t pg;
	vm_page_t p;
	int index;

	from = round_page(from);
	to = round_page(to);
	index = (from - trunc_page(bp->b_data)) >> PAGE_SHIFT;

	for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
		p = bp->b_pages[index];
		if (p && (index < bp->b_npages)) {
			if (p->busy) {
				printf("vm_hold_free_pages: blkno: %d, lblkno: %d\n",
					bp->b_blkno, bp->b_lblkno);
			}
			bp->b_pages[index] = NULL;
			pmap_kremove(pg);
			vm_page_unwire(p);
			vm_page_free(p);
		}
	}
	bp->b_npages = from >> PAGE_SHIFT;
}