/*
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Absolutely no warranty of function or purpose is made by the author
 *    John S. Dyson.
 * 4. Modifications may be freely made to this file if the above conditions
 *    are met.
 *
 * $Id: vfs_bio.c,v 1.8 1994/08/08 15:40:59 wollman Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#include <sys/mount.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <vm/vm.h>
#include <vm/vm_pageout.h>

#include <miscfs/specfs/specdev.h>

struct buf *buf;		/* buffer header pool */
int nbuf;			/* number of buffer headers calculated elsewhere */
struct swqueue bswlist;
struct buf *bclnlist;		/* Head of cleaned page list. */

extern vm_map_t buffer_map, io_map;

void vm_hold_free_pages(vm_offset_t from, vm_offset_t to);
void vm_hold_load_pages(vm_offset_t from, vm_offset_t to);

int needsbuffer;

/*
 * Internal update daemon, process 3
 *	The variable vfs_update_wakeup allows for internal syncs.
 */
int vfs_update_wakeup;

/*
 * Initialize buffer headers and related structures.
 */
void
bufinit()
{
	struct buf *bp;
	int i;

	TAILQ_INIT(&bswlist);
	LIST_INIT(&invalhash);

	/* first, make a null hash table */
	for (i = 0; i < BUFHSZ; i++)
		LIST_INIT(&bufhashtbl[i]);

	/* next, make a null set of free lists */
	for (i = 0; i < BUFFER_QUEUES; i++)
		TAILQ_INIT(&bufqueues[i]);

	/* finally, initialize each buffer header and stick on empty q */
	for (i = 0; i < nbuf; i++) {
		bp = &buf[i];
		bzero(bp, sizeof *bp);
		bp->b_flags = B_INVAL;	/* we're just an empty header */
		bp->b_dev = NODEV;
		bp->b_vp = NULL;
		bp->b_rcred = NOCRED;
		bp->b_wcred = NOCRED;
		bp->b_qindex = QUEUE_EMPTY;
		bp->b_vnbufs.le_next = NOLIST;
		bp->b_data = (caddr_t)kmem_alloc_pageable(buffer_map, MAXBSIZE);
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
	}
}

/*
 * remove the buffer from the appropriate free list
 */
void
bremfree(struct buf *bp)
{
	int s = splbio();

	if (bp->b_qindex != QUEUE_NONE) {
		TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
		bp->b_qindex = QUEUE_NONE;
	} else {
		panic("bremfree: removing a buffer when not on a queue");
	}
	splx(s);
}

/*
 * Get a buffer with the specified data.  Look in the cache first.
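 * If the block is not already cached, a read is issued and biowait()
 * supplies the return status; a cache hit returns immediately with no I/O.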
 */
int
bread(struct vnode *vp, daddr_t blkno, int size, struct ucred *cred,
    struct buf **bpp)
{
	struct buf *bp;

	bp = getblk(vp, blkno, size, 0, 0);
	*bpp = bp;

	/* if not found in cache, do some I/O */
	if ((bp->b_flags & B_CACHE) == 0) {
		if (curproc && curproc->p_stats)	/* count block I/O */
			curproc->p_stats->p_ru.ru_inblock++;
		bp->b_flags |= B_READ;
		bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
		if (bp->b_rcred == NOCRED) {
			if (cred != NOCRED)
				crhold(cred);
			bp->b_rcred = cred;
		}
		VOP_STRATEGY(bp);
		return (biowait(bp));
	}

	return (0);
}

/*
 * Operates like bread, but also starts asynchronous I/O on
 * read-ahead blocks.
 */
int
breadn(struct vnode *vp, daddr_t blkno, int size,
    daddr_t *rablkno, int *rabsize,
    int cnt, struct ucred *cred, struct buf **bpp)
{
	struct buf *bp, *rabp;
	int i;
	int rv = 0, readwait = 0;

	*bpp = bp = getblk(vp, blkno, size, 0, 0);

	/* if not found in cache, do some I/O */
	if ((bp->b_flags & B_CACHE) == 0) {
		if (curproc && curproc->p_stats)	/* count block I/O */
			curproc->p_stats->p_ru.ru_inblock++;
		bp->b_flags |= B_READ;
		bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
		if (bp->b_rcred == NOCRED) {
			if (cred != NOCRED)
				crhold(cred);
			bp->b_rcred = cred;
		}
		VOP_STRATEGY(bp);
		++readwait;
	}

	for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
		if (incore(vp, *rablkno)) {
			continue;
		}
		rabp = getblk(vp, *rablkno, *rabsize, 0, 0);

		if ((rabp->b_flags & B_CACHE) == 0) {
			if (curproc && curproc->p_stats)
				curproc->p_stats->p_ru.ru_inblock++;
			rabp->b_flags |= B_READ | B_ASYNC;
			rabp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
			if (rabp->b_rcred == NOCRED) {
				if (cred != NOCRED)
					crhold(cred);
				rabp->b_rcred = cred;
			}
			VOP_STRATEGY(rabp);
		} else {
			brelse(rabp);
		}
	}

	if (readwait) {
		rv = biowait(bp);
	}

	return (rv);
}

/*
 * Write, release buffer on completion.  (Done by iodone
 * if async.)
 */
int
bwrite(struct buf *bp)
{
	int oldflags = bp->b_flags;

	if (bp->b_flags & B_INVAL) {
		brelse(bp);
		return (0);
	}

	if (!(bp->b_flags & B_BUSY))
		panic("bwrite: buffer is not busy???");

	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
	bp->b_flags |= B_WRITEINPROG;

	if (oldflags & B_ASYNC) {
		if (oldflags & B_DELWRI) {
			reassignbuf(bp, bp->b_vp);
		} else if (curproc) {
			++curproc->p_stats->p_ru.ru_oublock;
		}
	}

	bp->b_vp->v_numoutput++;
	VOP_STRATEGY(bp);

	if ((oldflags & B_ASYNC) == 0) {
		int rtval = biowait(bp);

		if (oldflags & B_DELWRI) {
			reassignbuf(bp, bp->b_vp);
		} else if (curproc) {
			++curproc->p_stats->p_ru.ru_oublock;
		}
		brelse(bp);
		return (rtval);
	}

	return (0);
}

int
vn_bwrite(ap)
	struct vop_bwrite_args *ap;
{
	return (bwrite(ap->a_bp));
}

/*
 * Delayed write.  (Buffer is marked dirty).
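 * The buffer is marked B_DELWRI and B_DONE and then released; the data is
 * actually written later, either by the periodic sync from vfs_update() or
 * when getnewbuf() reclaims the header and converts the delayed write into
 * an asynchronous one.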
 */
void
bdwrite(struct buf *bp)
{
	if ((bp->b_flags & B_BUSY) == 0) {
		panic("bdwrite: buffer is not busy");
	}

	if (bp->b_flags & B_INVAL) {
		brelse(bp);
		return;
	}

	if (bp->b_flags & B_TAPE) {
		bawrite(bp);
		return;
	}

	bp->b_flags &= ~B_READ;
	if ((bp->b_flags & B_DELWRI) == 0) {
		if (curproc)
			++curproc->p_stats->p_ru.ru_oublock;
		bp->b_flags |= B_DONE | B_DELWRI;
		reassignbuf(bp, bp->b_vp);
	}
	brelse(bp);
	return;
}

/*
 * Asynchronous write.
 * Start output on a buffer, but do not wait for it to complete.
 * The buffer is released when the output completes.
 */
void
bawrite(struct buf *bp)
{
	bp->b_flags |= B_ASYNC;
	(void) bwrite(bp);
}

/*
 * Release a buffer.
 */
void
brelse(struct buf *bp)
{
	int x;

	/* anyone need a "free" block? */
	x = splbio();
	if (needsbuffer) {
		needsbuffer = 0;
		wakeup((caddr_t)&needsbuffer);
	}

	/* anyone need this block? */
	if (bp->b_flags & B_WANTED) {
		bp->b_flags &= ~(B_WANTED | B_AGE);
		wakeup((caddr_t)bp);
	}

	if (bp->b_flags & B_LOCKED)
		bp->b_flags &= ~B_ERROR;

	if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR)) ||
	    (bp->b_bufsize <= 0)) {
		bp->b_flags |= B_INVAL;
		bp->b_flags &= ~(B_DELWRI | B_CACHE);
		if (bp->b_vp)
			brelvp(bp);
	}

	if (bp->b_qindex != QUEUE_NONE)
		panic("brelse: free buffer onto another queue???");

	/* enqueue */
	/* buffers with no memory */
	if (bp->b_bufsize == 0) {
		bp->b_qindex = QUEUE_EMPTY;
		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
		LIST_REMOVE(bp, b_hash);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		bp->b_dev = NODEV;
	/* buffers with junk contents */
	} else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE)) {
		bp->b_qindex = QUEUE_AGE;
		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_AGE], bp, b_freelist);
		LIST_REMOVE(bp, b_hash);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		bp->b_dev = NODEV;
	/* buffers that are locked */
	} else if (bp->b_flags & B_LOCKED) {
		bp->b_qindex = QUEUE_LOCKED;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
	/* buffers with stale but valid contents */
	} else if (bp->b_flags & B_AGE) {
		bp->b_qindex = QUEUE_AGE;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_AGE], bp, b_freelist);
	/* buffers with valid and quite potentially reusable contents */
	} else {
		bp->b_qindex = QUEUE_LRU;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
	}

	/* unlock */
	bp->b_flags &= ~(B_WANTED | B_BUSY | B_ASYNC | B_NOCACHE | B_AGE);
	splx(x);
}

int freebufspace;
int allocbufspace;

/*
 * Find a buffer header which is available for use.
 */
struct buf *
getnewbuf(int slpflag, int slptimeo)
{
	struct buf *bp;
	int s;

	s = splbio();
start:
	/*
	 * Can we constitute a new buffer?
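	 * Prefer a completely empty header from the EMPTY queue; otherwise
	 * recycle a buffer from the AGE or LRU queue, converting any delayed
	 * write into an asynchronous write before reuse.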
	 */
	if ((bp = bufqueues[QUEUE_EMPTY].tqh_first) != NULL) {
		if (bp->b_qindex != QUEUE_EMPTY)
			panic("getnewbuf: inconsistent EMPTY queue");
		bremfree(bp);
		goto fillbuf;
	}

tryfree:
	if ((bp = bufqueues[QUEUE_AGE].tqh_first) != NULL) {
		if (bp->b_qindex != QUEUE_AGE)
			panic("getnewbuf: inconsistent AGE queue");
		bremfree(bp);
	} else if ((bp = bufqueues[QUEUE_LRU].tqh_first) != NULL) {
		if (bp->b_qindex != QUEUE_LRU)
			panic("getnewbuf: inconsistent LRU queue");
		bremfree(bp);
	} else {
		/* wait for a free buffer of any kind */
		needsbuffer = 1;
		tsleep((caddr_t)&needsbuffer, PRIBIO, "newbuf", 0);
		splx(s);
		return (0);
	}

	/* if we are a delayed write, convert to an async write */
	if (bp->b_flags & B_DELWRI) {
		bp->b_flags |= B_BUSY;
		bawrite(bp);
		goto start;
	}

	if (bp->b_vp)
		brelvp(bp);

	/* we are not free, nor do we contain interesting data */
	if (bp->b_rcred != NOCRED)
		crfree(bp->b_rcred);
	if (bp->b_wcred != NOCRED)
		crfree(bp->b_wcred);
fillbuf:
	bp->b_flags = B_BUSY;
	LIST_REMOVE(bp, b_hash);
	LIST_INSERT_HEAD(&invalhash, bp, b_hash);
	splx(s);
	bp->b_dev = NODEV;
	bp->b_vp = NULL;
	bp->b_blkno = bp->b_lblkno = 0;
	bp->b_iodone = 0;
	bp->b_error = 0;
	bp->b_resid = 0;
	bp->b_bcount = 0;
	bp->b_wcred = bp->b_rcred = NOCRED;
	bp->b_dirtyoff = bp->b_dirtyend = 0;
	bp->b_validoff = bp->b_validend = 0;
	return (bp);
}

/*
 * Check to see if a block is currently memory resident.
 */
struct buf *
incore(struct vnode *vp, daddr_t blkno)
{
	struct buf *bp;
	struct bufhashhdr *bh;
	int s = splbio();

	bh = BUFHASH(vp, blkno);
	bp = bh->lh_first;

	/* Search hash chain */
	while (bp) {
		if ((bp < buf) || (bp >= buf + nbuf)) {
			printf("incore: buf out of range: %lx, hash: %d\n",
			    (u_long)bp, (int)(bh - bufhashtbl));
			panic("incore: buf fault");
		}
		/* hit */
		if (bp->b_lblkno == blkno && bp->b_vp == vp &&
		    (bp->b_flags & B_INVAL) == 0) {
			splx(s);
			return (bp);
		}
		bp = bp->b_hash.le_next;
	}
	splx(s);

	return (0);
}

/*
 * Get a block given a specified block and offset into a file/device.
 */
struct buf *
getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo)
{
	struct buf *bp;
	int s;
	struct bufhashhdr *bh;

	s = splbio();
loop:
	if ((bp = incore(vp, blkno)) != NULL) {
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			tsleep((caddr_t)bp, PRIBIO, "getblk", 0);
			goto loop;
		}
		bp->b_flags |= B_BUSY | B_CACHE;
		bremfree(bp);
		/*
		 * check for size inconsistencies
		 */
		if (bp->b_bcount != size) {
			printf("getblk: invalid buffer size: %ld\n",
			    (long)bp->b_bcount);
			bp->b_flags |= B_INVAL;
			bwrite(bp);
			goto loop;
		}
	} else {
		if ((bp = getnewbuf(0, 0)) == 0)
			goto loop;
		allocbuf(bp, size);
		/*
		 * have to check again, because of a possible
		 * race condition.
		 */
		if (incore(vp, blkno)) {
			allocbuf(bp, 0);
			bp->b_flags |= B_INVAL;
			brelse(bp);
			goto loop;
		}
		bp->b_blkno = bp->b_lblkno = blkno;
		bgetvp(vp, bp);
		LIST_REMOVE(bp, b_hash);
		bh = BUFHASH(vp, blkno);
		LIST_INSERT_HEAD(bh, bp, b_hash);
	}
	splx(s);
	return (bp);
}

/*
 * Get an empty, disassociated buffer of given size.
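 * The buffer comes back B_BUSY with B_INVAL set, so its contents are never
 * mistaken for cached file data.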
522 */ 523 struct buf * 524 geteblk(int size) 525 { 526 struct buf *bp; 527 while ((bp = getnewbuf(0, 0)) == 0) 528 ; 529 allocbuf(bp, size); 530 bp->b_flags |= B_INVAL; 531 return (bp); 532 } 533 534 /* 535 * Modify the length of a buffer's underlying buffer storage without 536 * destroying information (unless, of course the buffer is shrinking). 537 */ 538 void 539 allocbuf(struct buf *bp, int size) 540 { 541 542 int newbsize = round_page(size); 543 544 if( newbsize == bp->b_bufsize) { 545 bp->b_bcount = size; 546 return; 547 } else if( newbsize < bp->b_bufsize) { 548 vm_hold_free_pages( 549 (vm_offset_t) bp->b_data + newbsize, 550 (vm_offset_t) bp->b_data + bp->b_bufsize); 551 } else if( newbsize > bp->b_bufsize) { 552 vm_hold_load_pages( 553 (vm_offset_t) bp->b_data + bp->b_bufsize, 554 (vm_offset_t) bp->b_data + newbsize); 555 } 556 557 /* adjust buffer cache's idea of memory allocated to buffer contents */ 558 freebufspace -= newbsize - bp->b_bufsize; 559 allocbufspace += newbsize - bp->b_bufsize; 560 561 bp->b_bufsize = newbsize; 562 bp->b_bcount = size; 563 } 564 565 /* 566 * Wait for buffer I/O completion, returning error status. 567 */ 568 int 569 biowait(register struct buf *bp) 570 { 571 int s; 572 573 s = splbio(); 574 while ((bp->b_flags & B_DONE) == 0) 575 tsleep((caddr_t)bp, PRIBIO, "biowait", 0); 576 if((bp->b_flags & B_ERROR) || bp->b_error) { 577 if ((bp->b_flags & B_INVAL) == 0) { 578 bp->b_flags |= B_INVAL; 579 bp->b_dev = NODEV; 580 LIST_REMOVE(bp, b_hash); 581 LIST_INSERT_HEAD(&invalhash, bp, b_hash); 582 } 583 if (!bp->b_error) 584 bp->b_error = EIO; 585 else 586 bp->b_flags |= B_ERROR; 587 splx(s); 588 return (bp->b_error); 589 } else { 590 splx(s); 591 return (0); 592 } 593 } 594 595 /* 596 * Finish I/O on a buffer, calling an optional function. 597 * This is usually called from interrupt level, so process blocking 598 * is not *a good idea*. 599 */ 600 void 601 biodone(register struct buf *bp) 602 { 603 int s; 604 s = splbio(); 605 bp->b_flags |= B_DONE; 606 607 if ((bp->b_flags & B_READ) == 0) { 608 vwakeup(bp); 609 } 610 611 if (bp->b_flags & B_BOUNCE) 612 vm_bounce_free(bp); 613 614 /* call optional completion function if requested */ 615 if (bp->b_flags & B_CALL) { 616 bp->b_flags &= ~B_CALL; 617 (*bp->b_iodone)(bp); 618 splx(s); 619 return; 620 } 621 622 /* 623 * For asynchronous completions, release the buffer now. The brelse 624 * checks for B_WANTED and will do the wakeup there if necessary - 625 * so no need to do a wakeup here in the async case. 626 */ 627 628 if (bp->b_flags & B_ASYNC) { 629 brelse(bp); 630 } else { 631 bp->b_flags &= ~B_WANTED; 632 wakeup((caddr_t) bp); 633 } 634 splx(s); 635 } 636 637 int 638 count_lock_queue() 639 { 640 int count; 641 struct buf *bp; 642 643 count = 0; 644 for(bp = bufqueues[QUEUE_LOCKED].tqh_first; 645 bp != NULL; 646 bp = bp->b_freelist.tqe_next) 647 count++; 648 return(count); 649 } 650 651 int vfs_update_interval = 30; 652 653 void 654 vfs_update() { 655 (void) spl0(); 656 while(1) { 657 tsleep((caddr_t)&vfs_update_wakeup, PRIBIO, "update", 658 hz * vfs_update_interval); 659 vfs_update_wakeup = 0; 660 sync(curproc, NULL, NULL); 661 } 662 } 663 664 /* 665 * these routines are not in the correct place (yet) 666 * also they work *ONLY* for kernel_pmap!!! 
667 */ 668 void 669 vm_hold_load_pages(vm_offset_t froma, vm_offset_t toa) { 670 vm_offset_t pg; 671 vm_page_t p; 672 vm_offset_t from = round_page(froma); 673 vm_offset_t to = round_page(toa); 674 675 for(pg = from ; pg < to ; pg += PAGE_SIZE) { 676 vm_offset_t pa; 677 678 tryagain: 679 if (cnt.v_free_count <= cnt.v_free_reserved) { 680 VM_WAIT; 681 goto tryagain; 682 } 683 684 p = vm_page_alloc(kernel_object, pg - VM_MIN_KERNEL_ADDRESS); 685 if( !p) { 686 VM_WAIT; 687 goto tryagain; 688 } 689 690 vm_page_wire(p); 691 pmap_kenter( pg, VM_PAGE_TO_PHYS(p)); 692 } 693 } 694 695 void 696 vm_hold_free_pages(vm_offset_t froma, vm_offset_t toa) { 697 vm_offset_t pg; 698 vm_page_t p; 699 vm_offset_t from = round_page(froma); 700 vm_offset_t to = round_page(toa); 701 702 for(pg = from ; pg < to ; pg += PAGE_SIZE) { 703 p = PHYS_TO_VM_PAGE( pmap_kextract( pg)); 704 pmap_kremove( pg); 705 vm_page_free(p); 706 } 707 } 708 709 void 710 bufstats() 711 { 712 } 713 714