/*
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Absolutely no warranty of function or purpose is made by the author
 *    John S. Dyson.
 * 4. Modifications may be freely made to this file if the above conditions
 *    are met.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#include <sys/mount.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <vm/vm.h>
#include <vm/vm_pageout.h>

#include <miscfs/specfs/specdev.h>

struct buf *buf;		/* buffer header pool */
int nbuf;			/* number of buffer headers calculated elsewhere */

extern vm_map_t buffer_map, io_map;

void vm_hold_free_pages(vm_offset_t from, vm_offset_t to);
void vm_hold_load_pages(vm_offset_t from, vm_offset_t to);

int needsbuffer;

/*
 * Internal update daemon, process 3
 *	The variable vfs_update_wakeup allows for internal syncs.
 */
int vfs_update_wakeup;

/*
 * Initialize buffer headers and related structures.
 */
void
bufinit()
{
	struct buf *bp;
	int i;

	TAILQ_INIT(&bswlist);
	LIST_INIT(&invalhash);

	/* first, make a null hash table */
	for (i = 0; i < BUFHSZ; i++)
		LIST_INIT(&bufhashtbl[i]);

	/* next, make a null set of free lists */
	for (i = 0; i < BUFFER_QUEUES; i++)
		TAILQ_INIT(&bufqueues[i]);

	/* finally, initialize each buffer header and stick on empty q */
	for (i = 0; i < nbuf; i++) {
		bp = &buf[i];
		bzero(bp, sizeof *bp);
		bp->b_flags = B_INVAL;	/* we're just an empty header */
		bp->b_dev = NODEV;
		bp->b_vp = NULL;
		bp->b_rcred = NOCRED;
		bp->b_wcred = NOCRED;
		bp->b_qindex = QUEUE_EMPTY;
		bp->b_vnbufs.le_next = NOLIST;
		bp->b_data = (caddr_t) kmem_alloc_pageable(buffer_map, MAXBSIZE);
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
	}
}

/*
 * remove the buffer from the appropriate free list
 */
void
bremfree(struct buf *bp)
{
	int s = splbio();

	if (bp->b_qindex != QUEUE_NONE) {
		TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
		bp->b_qindex = QUEUE_NONE;
	} else {
		panic("bremfree: removing a buffer when not on a queue");
	}
	splx(s);
}
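
/*
 * Descriptive note (derived from the queue handling in brelse() and
 * getnewbuf() below, not from any header): a buffer header sits on at
 * most one free queue at a time, roughly as follows:
 *
 *	QUEUE_NONE	buffer is B_BUSY and on no free list
 *	QUEUE_EMPTY	header only, no backing storage (b_bufsize == 0)
 *	QUEUE_AGE	invalid or aged contents, reclaimed first
 *	QUEUE_LRU	valid contents, reclaimed in LRU order
 *	QUEUE_LOCKED	B_LOCKED buffers, never reclaimed by getnewbuf()
 */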
/*
 * Get a buffer with the specified data.  Look in the cache first.
 */
int
bread(struct vnode *vp, daddr_t blkno, int size, struct ucred *cred,
	struct buf **bpp)
{
	struct buf *bp;

	bp = getblk(vp, blkno, size, 0, 0);
	*bpp = bp;

	/* if not found in cache, do some I/O */
	if ((bp->b_flags & B_CACHE) == 0) {
		if (curproc && curproc->p_stats)	/* count block I/O */
			curproc->p_stats->p_ru.ru_inblock++;
		bp->b_flags |= B_READ;
		bp->b_flags &= ~(B_DONE|B_ERROR|B_INVAL);
		if (bp->b_rcred == NOCRED) {
			if (cred != NOCRED)
				crhold(cred);
			bp->b_rcred = cred;
		}
		VOP_STRATEGY(bp);
		return (biowait(bp));
	}

	return (0);
}

/*
 * Operates like bread, but also starts asynchronous I/O on
 * read-ahead blocks.
 */
int
breadn(struct vnode *vp, daddr_t blkno, int size,
	daddr_t *rablkno, int *rabsize,
	int cnt, struct ucred *cred, struct buf **bpp)
{
	struct buf *bp, *rabp;
	int i;
	int rv = 0, readwait = 0;

	*bpp = bp = getblk(vp, blkno, size, 0, 0);

	/* if not found in cache, do some I/O */
	if ((bp->b_flags & B_CACHE) == 0) {
		if (curproc && curproc->p_stats)	/* count block I/O */
			curproc->p_stats->p_ru.ru_inblock++;
		bp->b_flags |= B_READ;
		bp->b_flags &= ~(B_DONE|B_ERROR|B_INVAL);
		if (bp->b_rcred == NOCRED) {
			if (cred != NOCRED)
				crhold(cred);
			bp->b_rcred = cred;
		}
		VOP_STRATEGY(bp);
		++readwait;
	}

	for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
		if (incore(vp, *rablkno)) {
			continue;
		}
		rabp = getblk(vp, *rablkno, *rabsize, 0, 0);

		if ((rabp->b_flags & B_CACHE) == 0) {
			if (curproc && curproc->p_stats)
				curproc->p_stats->p_ru.ru_inblock++;
			rabp->b_flags |= B_READ | B_ASYNC;
			rabp->b_flags &= ~(B_DONE|B_ERROR|B_INVAL);
			if (rabp->b_rcred == NOCRED) {
				if (cred != NOCRED)
					crhold(cred);
				rabp->b_rcred = cred;
			}
			VOP_STRATEGY(rabp);
		} else {
			brelse(rabp);
		}
	}

	if (readwait) {
		rv = biowait(bp);
	}

	return (rv);
}

/*
 * Write, release buffer on completion.  (Done by iodone
 * if async.)
 */
int
bwrite(struct buf *bp)
{
	int oldflags = bp->b_flags;

	if (bp->b_flags & B_INVAL) {
		brelse(bp);
		return (0);
	}

	if (!(bp->b_flags & B_BUSY))
		panic("bwrite: buffer is not busy???");

	bp->b_flags &= ~(B_READ|B_DONE|B_ERROR|B_DELWRI);
	bp->b_flags |= B_WRITEINPROG;

	if (oldflags & B_ASYNC) {
		if (oldflags & B_DELWRI) {
			reassignbuf(bp, bp->b_vp);
		} else if (curproc) {
			++curproc->p_stats->p_ru.ru_oublock;
		}
	}

	bp->b_vp->v_numoutput++;
	VOP_STRATEGY(bp);

	if ((oldflags & B_ASYNC) == 0) {
		int rtval = biowait(bp);

		if (oldflags & B_DELWRI) {
			reassignbuf(bp, bp->b_vp);
		} else if (curproc) {
			++curproc->p_stats->p_ru.ru_oublock;
		}
		brelse(bp);
		return (rtval);
	}

	return (0);
}

int
vn_bwrite(ap)
	struct vop_bwrite_args *ap;
{
	return (bwrite(ap->a_bp));
}
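
/*
 * Illustrative sketch (not part of this file): a typical filesystem read
 * path uses bread() and brelse() along these lines, where "vp", "lbn" and
 * "bsize" stand for whatever vnode, logical block number and block size
 * the caller has in hand:
 *
 *	struct buf *bp;
 *	int error;
 *
 *	error = bread(vp, lbn, bsize, NOCRED, &bp);
 *	if (error) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	...copy out of bp->b_data...
 *	brelse(bp);
 *
 * A synchronous write of a modified, busy buffer goes through bwrite(bp),
 * which waits via biowait() and releases the buffer itself.
 */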
/*
 * Delayed write.  (Buffer is marked dirty).
 */
void
bdwrite(struct buf *bp)
{

	if ((bp->b_flags & B_BUSY) == 0) {
		panic("bdwrite: buffer is not busy");
	}

	if (bp->b_flags & B_INVAL) {
		brelse(bp);
		return;
	}

	if (bp->b_flags & B_TAPE) {
		bawrite(bp);
		return;
	}

	bp->b_flags &= ~B_READ;
	if ((bp->b_flags & B_DELWRI) == 0) {
		if (curproc)
			++curproc->p_stats->p_ru.ru_oublock;
		bp->b_flags |= B_DONE|B_DELWRI;
		reassignbuf(bp, bp->b_vp);
	}
	brelse(bp);
	return;
}

/*
 * Asynchronous write.
 * Start output on a buffer, but do not wait for it to complete.
 * The buffer is released when the output completes.
 */
void
bawrite(struct buf *bp)
{
	bp->b_flags |= B_ASYNC;
	(void) bwrite(bp);
}

/*
 * Release a buffer.
 */
void
brelse(struct buf *bp)
{
	int x;

	/* anyone need a "free" block? */
	x = splbio();
	if (needsbuffer) {
		needsbuffer = 0;
		wakeup((caddr_t)&needsbuffer);
	}
	/* anyone need this very block? */
	if (bp->b_flags & B_WANTED) {
		bp->b_flags &= ~(B_WANTED|B_AGE);
		wakeup((caddr_t)bp);
	}

	if (bp->b_flags & B_LOCKED)
		bp->b_flags &= ~B_ERROR;

	if ((bp->b_flags & (B_NOCACHE|B_INVAL|B_ERROR)) ||
	    (bp->b_bufsize <= 0)) {
		bp->b_flags |= B_INVAL;
		bp->b_flags &= ~(B_DELWRI|B_CACHE);
		if (bp->b_vp)
			brelvp(bp);
	}

	if (bp->b_qindex != QUEUE_NONE)
		panic("brelse: free buffer onto another queue???");

	/* enqueue */
	/* buffers with junk contents */
	if (bp->b_bufsize == 0) {
		bp->b_qindex = QUEUE_EMPTY;
		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
		LIST_REMOVE(bp, b_hash);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		bp->b_dev = NODEV;
	/* buffers with invalid or uncacheable contents */
	} else if (bp->b_flags & (B_ERROR|B_INVAL|B_NOCACHE)) {
		bp->b_qindex = QUEUE_AGE;
		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_AGE], bp, b_freelist);
		LIST_REMOVE(bp, b_hash);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		bp->b_dev = NODEV;
	/* buffers that are locked */
	} else if (bp->b_flags & B_LOCKED) {
		bp->b_qindex = QUEUE_LOCKED;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
	/* buffers with stale but valid contents */
	} else if (bp->b_flags & B_AGE) {
		bp->b_qindex = QUEUE_AGE;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_AGE], bp, b_freelist);
	/* buffers with valid and quite potentially reusable contents */
	} else {
		bp->b_qindex = QUEUE_LRU;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
	}

	/* unlock */
	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC|B_NOCACHE|B_AGE);
	splx(x);
}

int freebufspace;
int allocbufspace;

/*
 * Find a buffer header which is available for use.
 */
struct buf *
getnewbuf(int slpflag, int slptimeo)
{
	struct buf *bp;
	int x;

	x = splbio();
start:
	/* can we constitute a new buffer? */
	if (bp = bufqueues[QUEUE_EMPTY].tqh_first) {
		if (bp->b_qindex != QUEUE_EMPTY)
			panic("getnewbuf: inconsistent EMPTY queue");
		bremfree(bp);
		goto fillbuf;
	}

tryfree:
	if (bp = bufqueues[QUEUE_AGE].tqh_first) {
		if (bp->b_qindex != QUEUE_AGE)
			panic("getnewbuf: inconsistent AGE queue");
		bremfree(bp);
	} else if (bp = bufqueues[QUEUE_LRU].tqh_first) {
		if (bp->b_qindex != QUEUE_LRU)
			panic("getnewbuf: inconsistent LRU queue");
		bremfree(bp);
	} else {
		/* wait for a free buffer of any kind */
		needsbuffer = 1;
		tsleep((caddr_t)&needsbuffer, PRIBIO, "newbuf", 0);
		splx(x);
		return (0);
	}

	/* if we are a delayed write, convert to an async write */
	if (bp->b_flags & B_DELWRI) {
		bp->b_flags |= B_BUSY;
		bawrite(bp);
		goto start;
	}

	if (bp->b_vp)
		brelvp(bp);

	/* we are not free, nor do we contain interesting data */
	if (bp->b_rcred != NOCRED)
		crfree(bp->b_rcred);
	if (bp->b_wcred != NOCRED)
		crfree(bp->b_wcred);
fillbuf:
	bp->b_flags = B_BUSY;
	LIST_REMOVE(bp, b_hash);
	LIST_INSERT_HEAD(&invalhash, bp, b_hash);
	splx(x);
	bp->b_dev = NODEV;
	bp->b_vp = NULL;
	bp->b_blkno = bp->b_lblkno = 0;
	bp->b_iodone = 0;
	bp->b_error = 0;
	bp->b_resid = 0;
	bp->b_bcount = 0;
	bp->b_wcred = bp->b_rcred = NOCRED;
	bp->b_dirtyoff = bp->b_dirtyend = 0;
	bp->b_validoff = bp->b_validend = 0;
	return (bp);
}

/*
 * Check to see if a block is currently memory resident.
 */
struct buf *
incore(struct vnode *vp, daddr_t blkno)
{
	struct buf *bp;
	struct bufhashhdr *bh;

	int s = splbio();

	bh = BUFHASH(vp, blkno);
	bp = bh->lh_first;

	/* Search hash chain */
	while (bp) {
		if ((bp < buf) || (bp >= buf + nbuf)) {
			printf("incore: buf out of range: %lx, hash: %d\n",
			    bp, bh - bufhashtbl);
			panic("incore: buf fault");
		}
		/* hit */
		if (bp->b_lblkno == blkno && bp->b_vp == vp
		    && (bp->b_flags & B_INVAL) == 0) {
			splx(s);
			return (bp);
		}
		bp = bp->b_hash.le_next;
	}
	splx(s);

	return (0);
}

/*
 * Get a block given a specified block and offset into a file/device.
 */
struct buf *
getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo)
{
	struct buf *bp;
	int x;
	struct bufhashhdr *bh;

	x = splbio();
loop:
	if (bp = incore(vp, blkno)) {
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			tsleep((caddr_t)bp, PRIBIO, "getblk", 0);
			goto loop;
		}
		bp->b_flags |= B_BUSY | B_CACHE;
		bremfree(bp);
		/*
		 * check for size inconsistencies
		 */
		if (bp->b_bcount != size) {
			printf("getblk: invalid buffer size: %d\n", bp->b_bcount);
			bp->b_flags |= B_INVAL;
			bwrite(bp);
			goto loop;
		}
	} else {
		if ((bp = getnewbuf(0, 0)) == 0)
			goto loop;
		allocbuf(bp, size);
		/*
		 * have to check again, because of a possible
		 * race condition.
		 */
		if (incore(vp, blkno)) {
			allocbuf(bp, 0);
			bp->b_flags |= B_INVAL;
			brelse(bp);
			goto loop;
		}
		bp->b_blkno = bp->b_lblkno = blkno;
		bgetvp(vp, bp);
		LIST_REMOVE(bp, b_hash);
		bh = BUFHASH(vp, blkno);
		LIST_INSERT_HEAD(bh, bp, b_hash);
	}
	splx(x);
	return (bp);
}
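
/*
 * Illustrative sketch (not part of this file): callers that intend to
 * overwrite a whole block can skip the read and use getblk() directly,
 * then mark the result dirty with bdwrite() or push it out with bawrite();
 * "vp", "lbn" and "bsize" are whatever the caller has in hand:
 *
 *	struct buf *bp;
 *
 *	bp = getblk(vp, lbn, bsize, 0, 0);
 *	...fill bp->b_data...
 *	bdwrite(bp);		(delayed write; bdwrite releases bp)
 */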
516 */ 517 struct buf * 518 geteblk(int size) 519 { 520 struct buf *bp; 521 while ((bp = getnewbuf(0, 0)) == 0) 522 ; 523 allocbuf(bp, size); 524 bp->b_flags |= B_INVAL; 525 return (bp); 526 } 527 528 /* 529 * Modify the length of a buffer's underlying buffer storage without 530 * destroying information (unless, of course the buffer is shrinking). 531 */ 532 void 533 allocbuf(struct buf *bp, int size) 534 { 535 536 int newbsize = round_page(size); 537 538 if( newbsize == bp->b_bufsize) { 539 bp->b_bcount = size; 540 return; 541 } else if( newbsize < bp->b_bufsize) { 542 vm_hold_free_pages( 543 (vm_offset_t) bp->b_data + newbsize, 544 (vm_offset_t) bp->b_data + bp->b_bufsize); 545 } else if( newbsize > bp->b_bufsize) { 546 vm_hold_load_pages( 547 (vm_offset_t) bp->b_data + bp->b_bufsize, 548 (vm_offset_t) bp->b_data + newbsize); 549 } 550 551 /* adjust buffer cache's idea of memory allocated to buffer contents */ 552 freebufspace -= newbsize - bp->b_bufsize; 553 allocbufspace += newbsize - bp->b_bufsize; 554 555 bp->b_bufsize = newbsize; 556 bp->b_bcount = size; 557 } 558 559 /* 560 * Wait for buffer I/O completion, returning error status. 561 */ 562 int 563 biowait(register struct buf *bp) 564 { 565 int x; 566 567 x = splbio(); 568 while ((bp->b_flags & B_DONE) == 0) 569 tsleep((caddr_t)bp, PRIBIO, "biowait", 0); 570 if((bp->b_flags & B_ERROR) || bp->b_error) { 571 if ((bp->b_flags & B_INVAL) == 0) { 572 bp->b_flags |= B_INVAL; 573 bp->b_dev = NODEV; 574 LIST_REMOVE(bp, b_hash); 575 LIST_INSERT_HEAD(&invalhash, bp, b_hash); 576 } 577 if (!bp->b_error) 578 bp->b_error = EIO; 579 else 580 bp->b_flags |= B_ERROR; 581 splx(x); 582 return (bp->b_error); 583 } else { 584 splx(x); 585 return (0); 586 } 587 } 588 589 /* 590 * Finish I/O on a buffer, calling an optional function. 591 * This is usually called from interrupt level, so process blocking 592 * is not *a good idea*. 593 */ 594 void 595 biodone(register struct buf *bp) 596 { 597 int s; 598 s = splbio(); 599 bp->b_flags |= B_DONE; 600 601 if ((bp->b_flags & B_READ) == 0) { 602 vwakeup(bp); 603 } 604 605 /* call optional completion function if requested */ 606 if (bp->b_flags & B_CALL) { 607 bp->b_flags &= ~B_CALL; 608 (*bp->b_iodone)(bp); 609 splx(s); 610 return; 611 } 612 613 /* 614 * For asynchronous completions, release the buffer now. The brelse 615 * checks for B_WANTED and will do the wakeup there if necessary - 616 * so no need to do a wakeup here in the async case. 617 */ 618 619 if (bp->b_flags & B_ASYNC) { 620 brelse(bp); 621 } else { 622 bp->b_flags &= ~B_WANTED; 623 wakeup((caddr_t) bp); 624 } 625 splx(s); 626 } 627 628 int 629 count_lock_queue() 630 { 631 int count; 632 struct buf *bp; 633 634 count = 0; 635 for(bp = bufqueues[QUEUE_LOCKED].tqh_first; 636 bp != NULL; 637 bp = bp->b_freelist.tqe_next) 638 count++; 639 return(count); 640 } 641 642 #ifndef UPDATE_INTERVAL 643 int vfs_update_interval = 30; 644 #else 645 int vfs_update_interval = UPDATE_INTERVAL; 646 #endif 647 648 void 649 vfs_update() { 650 (void) spl0(); 651 while(1) { 652 tsleep((caddr_t)&vfs_update_wakeup, PRIBIO, "update", 653 hz * vfs_update_interval); 654 vfs_update_wakeup = 0; 655 sync(curproc, NULL, NULL); 656 } 657 } 658 659 /* 660 * these routines are not in the correct place (yet) 661 * also they work *ONLY* for kernel_pmap!!! 
662 */ 663 void 664 vm_hold_load_pages(vm_offset_t froma, vm_offset_t toa) { 665 vm_offset_t pg; 666 vm_page_t p; 667 vm_offset_t from = round_page(froma); 668 vm_offset_t to = round_page(toa); 669 670 for(pg = from ; pg < to ; pg += PAGE_SIZE) { 671 vm_offset_t pa; 672 673 tryagain: 674 p = vm_page_alloc(kernel_object, pg - VM_MIN_KERNEL_ADDRESS); 675 if( !p) { 676 VM_WAIT; 677 goto tryagain; 678 } 679 680 vm_page_wire(p); 681 pmap_enter(kernel_pmap, pg, VM_PAGE_TO_PHYS(p), 682 VM_PROT_READ|VM_PROT_WRITE, 1); 683 } 684 } 685 686 void 687 vm_hold_free_pages(vm_offset_t froma, vm_offset_t toa) { 688 vm_offset_t pg; 689 vm_page_t p; 690 vm_offset_t from = round_page(froma); 691 vm_offset_t to = round_page(toa); 692 693 for(pg = from ; pg < to ; pg += PAGE_SIZE) { 694 vm_offset_t pa; 695 pa = pmap_kextract(pg); 696 if( !pa) { 697 printf("No pa for va: %x\n", pg); 698 } else { 699 p = PHYS_TO_VM_PAGE( pa); 700 pmap_remove(kernel_pmap, pg, pg + PAGE_SIZE); 701 vm_page_free(p); 702 } 703 } 704 } 705 706 void 707 bufstats() 708 { 709 } 710 711