1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 27 /* All Rights Reserved */ 28 29 /* 30 * University Copyright- Copyright (c) 1982, 1986, 1988 31 * The Regents of the University of California 32 * All Rights Reserved 33 * 34 * University Acknowledgment- Portions of this document are derived from 35 * software developed by the University of California, Berkeley, and its 36 * contributors. 
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/sysmacros.h>
#include <sys/conf.h>
#include <sys/cpuvar.h>
#include <sys/errno.h>
#include <sys/debug.h>
#include <sys/buf.h>
#include <sys/var.h>
#include <sys/vnode.h>
#include <sys/bitmap.h>
#include <sys/cmn_err.h>
#include <sys/kmem.h>
#include <sys/vmem.h>
#include <sys/atomic.h>
#include <vm/seg_kmem.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <sys/vtrace.h>
#include <sys/tnf_probe.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_bio.h>
#include <sys/fs/ufs_log.h>
#include <sys/systm.h>
#include <sys/vfs.h>
#include <sys/sdt.h>

/* Locks */
static	kmutex_t	blist_lock;	/* protects b_list */
static	kmutex_t	bhdr_lock;	/* protects the bhdrlist */
static	kmutex_t	bfree_lock;	/* protects the bfreelist structure */

struct	hbuf	*hbuf;			/* Hash buckets */
struct	dwbuf	*dwbuf;			/* Delayed write buckets */
static struct buf *bhdrlist;		/* buf header free list */
static int	nbuf;			/* number of buffer headers allocated */

static int	lastindex;		/* Reference point on where to start */
					/* when looking for free buffers */

/* Hash a (device, block number) pair into an hbuf bucket index. */
#define	bio_bhash(dev, bn)	(hash2ints((dev), (int)(bn)) & v.v_hmask)
/* Sentinel terminating the b_list chains built by bflush()/bfinval(). */
#define	EMPTY_LIST	((struct buf *)-1)

static kcondvar_t	bio_mem_cv;	/* Condition variables */
static kcondvar_t	bio_flushinval_cv;
static int	bio_doingflush;		/* flush in progress */
static int	bio_doinginval;		/* inval in progress */
static int	bio_flinv_cv_wanted;	/* someone waiting for cv */

/*
 * Statistics on the buffer cache
 */
struct biostats biostats = {
	{ "buffer_cache_lookups",		KSTAT_DATA_UINT32 },
	{ "buffer_cache_hits",			KSTAT_DATA_UINT32 },
	{ "new_buffer_requests",		KSTAT_DATA_UINT32 },
	{ "waits_for_buffer_allocs",		KSTAT_DATA_UINT32 },
	{ "buffers_locked_by_someone",		KSTAT_DATA_UINT32 },
	{ "duplicate_buffers_found",		KSTAT_DATA_UINT32 }
};

/*
 * kstat data
 */
kstat_named_t	*biostats_ptr = (kstat_named_t *)&biostats;
uint_t		biostats_ndata = (uint_t)(sizeof (biostats) /
					sizeof (kstat_named_t));

/*
 * Statistics on ufs buffer cache
 * Not protected by locks
 */
struct ufsbiostats ub = {
	{ "breads",			KSTAT_DATA_UINT32 },
	{ "bwrites",			KSTAT_DATA_UINT32 },
	{ "fbiwrites",			KSTAT_DATA_UINT32 },
	{ "getpages",			KSTAT_DATA_UINT32 },
	{ "getras",			KSTAT_DATA_UINT32 },
	{ "putsyncs",			KSTAT_DATA_UINT32 },
	{ "putasyncs",			KSTAT_DATA_UINT32 },
	{ "putpageios",			KSTAT_DATA_UINT32 },
};

/*
 * more UFS Logging eccentricities...
 *
 * required since "#pragma weak ..." doesn't work in reverse order.
 * i.e.: genunix (bio.c) is loaded before the ufs modules and pointers
 *       to ufs routines don't get plugged into bio.c calls so
 *       we initialize it when setting up the "lufsops" table
 *       in "lufs.c:_init()"
 */
void (*bio_lufs_strategy)(void *, buf_t *);
void (*bio_snapshot_strategy)(void *, buf_t *);


/* Private routines */
static struct	buf	*bio_getfreeblk(long);
static void		bio_mem_get(long);
static void		bio_bhdr_free(struct buf *);
static struct	buf	*bio_bhdr_alloc(void);
static void		bio_recycle(int, long);
static void		bio_pageio_done(struct buf *);
static int		bio_incore(dev_t, daddr_t);

/*
 * Buffer cache constants
 *
 * Note: the *_PERCENT constants encode a percentage as its divisor,
 * i.e. 100/2 means "2%" -- the value is used as "physmem / divisor".
 */
#define	BIO_BUF_PERCENT	(100/2)		/* default: 2% of memory */
#define	BIO_MAX_PERCENT	(100/20)	/* max is 20% of real memory */
#define	BIO_BHDR_POOL	100		/* Default bhdr pool size */
#define	BIO_MIN_HDR	10		/* Minimum number of buffer headers */
#define	BIO_MIN_HWM	(BIO_MIN_HDR * MAXBSIZE / 1024)
#define	BIO_HASHLEN	4		/* Target length of hash chains */


/* Flags for bio_recycle() */
#define	BIO_HEADER	0x01
#define	BIO_MEM		0x02

extern	int bufhwm;		/* User tunable - high water mark for mem */
extern	int bufhwm_pct;		/* ditto - given in % of physmem */

/*
 * The following routines allocate and free
 * buffers with various side effects.  In general the
 * arguments to an allocate routine are a device and
 * a block number, and the value is a pointer to
 * to the buffer header; the buffer returned is locked with a
 * binary semaphore so that no one else can touch it. If the block was
 * already in core, no I/O need be done; if it is
 * already locked, the process waits until it becomes free.
 * The following routines allocate a buffer:
 *	getblk
 *	bread/BREAD
 *	breada
 * Eventually the buffer must be released, possibly with the
 * side effect of writing it out, by using one of
 *	bwrite/BWRITE/brwrite
 *	bdwrite/bdrwrite
 *	bawrite
 *	brelse
 *
 * The B_WANTED/B_BUSY bits are NOT used by these routines for synchronization.
 * Instead, a binary semaphore, b_sem is used to gain exclusive access to
 * a buffer and a binary semaphore, b_io is used for I/O synchronization.
 * B_DONE is still used to denote a buffer with I/O complete on it.
 *
 * The bfreelist.b_bcount field is computed everytime fsflush runs. It is
 * should not be used where a very accurate count of the free buffers is
 * needed.
 */

/*
 * Read in (if necessary) the block and return a buffer pointer.
 *
 * This interface is provided for binary compatibility.  Using
 * BREAD() directly avoids the extra function call overhead invoked
 * by calling this routine.
 */
struct buf *
bread(dev_t dev, daddr_t blkno, long bsize)
{
	return (BREAD(dev, blkno, bsize));
}

/*
 * Common code for reading a buffer with various options
 *
 * Read in (if necessary) the block and return a buffer pointer.
209 */ 210 struct buf * 211 bread_common(void *arg, dev_t dev, daddr_t blkno, long bsize) 212 { 213 struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg; 214 struct buf *bp; 215 klwp_t *lwp = ttolwp(curthread); 216 217 CPU_STATS_ADD_K(sys, lread, 1); 218 bp = getblk_common(ufsvfsp, dev, blkno, bsize, /* errflg */ 1); 219 if (bp->b_flags & B_DONE) 220 return (bp); 221 bp->b_flags |= B_READ; 222 ASSERT(bp->b_bcount == bsize); 223 if (ufsvfsp == NULL) { /* !ufs */ 224 (void) bdev_strategy(bp); 225 } else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) { 226 /* ufs && logging */ 227 (*bio_lufs_strategy)(ufsvfsp->vfs_log, bp); 228 } else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) { 229 /* ufs && snapshots */ 230 (*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp); 231 } else { 232 ufsvfsp->vfs_iotstamp = ddi_get_lbolt(); 233 ub.ub_breads.value.ul++; /* ufs && !logging */ 234 (void) bdev_strategy(bp); 235 } 236 if (lwp != NULL) 237 lwp->lwp_ru.inblock++; 238 CPU_STATS_ADD_K(sys, bread, 1); 239 (void) biowait(bp); 240 return (bp); 241 } 242 243 /* 244 * Read in the block, like bread, but also start I/O on the 245 * read-ahead block (which is not allocated to the caller). 
246 */ 247 struct buf * 248 breada(dev_t dev, daddr_t blkno, daddr_t rablkno, long bsize) 249 { 250 struct buf *bp, *rabp; 251 klwp_t *lwp = ttolwp(curthread); 252 253 bp = NULL; 254 if (!bio_incore(dev, blkno)) { 255 CPU_STATS_ADD_K(sys, lread, 1); 256 bp = GETBLK(dev, blkno, bsize); 257 if ((bp->b_flags & B_DONE) == 0) { 258 bp->b_flags |= B_READ; 259 bp->b_bcount = bsize; 260 (void) bdev_strategy(bp); 261 if (lwp != NULL) 262 lwp->lwp_ru.inblock++; 263 CPU_STATS_ADD_K(sys, bread, 1); 264 } 265 } 266 if (rablkno && bfreelist.b_bcount > 1 && 267 !bio_incore(dev, rablkno)) { 268 rabp = GETBLK(dev, rablkno, bsize); 269 if (rabp->b_flags & B_DONE) 270 brelse(rabp); 271 else { 272 rabp->b_flags |= B_READ|B_ASYNC; 273 rabp->b_bcount = bsize; 274 (void) bdev_strategy(rabp); 275 if (lwp != NULL) 276 lwp->lwp_ru.inblock++; 277 CPU_STATS_ADD_K(sys, bread, 1); 278 } 279 } 280 if (bp == NULL) 281 return (BREAD(dev, blkno, bsize)); 282 (void) biowait(bp); 283 return (bp); 284 } 285 286 /* 287 * Common code for writing a buffer with various options. 
 *
 * force_wait  - wait for write completion regardless of B_ASYNC flag
 * do_relse    - release the buffer when we are done
 * clear_flags - flags to clear from the buffer
 */
void
bwrite_common(void *arg, struct buf *bp, int force_wait,
    int do_relse, int clear_flags)
{
	register int do_wait;
	struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;
	int flag;
	klwp_t *lwp = ttolwp(curthread);
	struct cpu *cpup;

	ASSERT(SEMA_HELD(&bp->b_sem));
	/* Capture the flags before clearing so B_ASYNC can still be tested. */
	flag = bp->b_flags;
	bp->b_flags &= ~clear_flags;
	if (lwp != NULL)
		lwp->lwp_ru.oublock++;
	CPU_STATS_ENTER_K();
	cpup = CPU;		/* get pointer AFTER preemption is disabled */
	CPU_STATS_ADDQ(cpup, sys, lwrite, 1);
	CPU_STATS_ADDQ(cpup, sys, bwrite, 1);
	do_wait = ((flag & B_ASYNC) == 0 || force_wait);
	if (do_wait == 0)
		CPU_STATS_ADDQ(cpup, sys, bawrite, 1);
	CPU_STATS_EXIT_K();
	/*
	 * Dispatch the write: raw device, ufs log, ufs snapshot, or
	 * plain ufs, in that order of precedence.
	 */
	if (ufsvfsp == NULL) {
		(void) bdev_strategy(bp);
	} else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
		/* ufs && logging */
		(*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
	} else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
		/* ufs && snapshots */
		(*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
	} else {
		ub.ub_bwrites.value.ul++;		/* ufs && !logging */
		(void) bdev_strategy(bp);
	}
	if (do_wait) {
		(void) biowait(bp);
		if (do_relse) {
			brelse(bp);
		}
	}
}

/*
 * Write the buffer, waiting for completion (unless B_ASYNC is set).
 * Then release the buffer.
 * This interface is provided for binary compatibility.  Using
 * BWRITE() directly avoids the extra function call overhead invoked
 * by calling this routine.
 */
void
bwrite(struct buf *bp)
{
	BWRITE(bp);
}

/*
 * Write the buffer, waiting for completion.
 * But don't release the buffer afterwards.
 * This interface is provided for binary compatibility.
 * Using
 * BWRITE2() directly avoids the extra function call overhead.
 */
void
bwrite2(struct buf *bp)
{
	BWRITE2(bp);
}

/*
 * Release the buffer, marking it so that if it is grabbed
 * for another purpose it will be written out before being
 * given up (e.g. when writing a partial block where it is
 * assumed that another write for the same block will soon follow).
 * Also save the time that the block is first marked as delayed
 * so that it will be written in a reasonable time.
 */
void
bdwrite(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));
	CPU_STATS_ADD_K(sys, lwrite, 1);
	/* Only stamp b_start on the first transition to delayed-write. */
	if ((bp->b_flags & B_DELWRI) == 0)
		bp->b_start = ddi_get_lbolt();
	/*
	 * B_DONE allows others to use the buffer, B_DELWRI causes the
	 * buffer to be written before being reused, and setting b_resid
	 * to zero says the buffer is complete.
	 */
	bp->b_flags |= B_DELWRI | B_DONE;
	bp->b_resid = 0;
	brelse(bp);
}

/*
 * Release the buffer, start I/O on it, but don't wait for completion.
 */
void
bawrite(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));

	/* Use bfreelist.b_bcount as a weird-ass heuristic */
	if (bfreelist.b_bcount > 4)
		bp->b_flags |= B_ASYNC;
	BWRITE(bp);
}

/*
 * Release the buffer, with no I/O implied.
 */
void
brelse(struct buf *bp)
{
	struct buf	**backp;
	uint_t		index;
	kmutex_t	*hmp;
	struct	buf	*dp;
	struct	hbuf	*hp;


	ASSERT(SEMA_HELD(&bp->b_sem));

	/*
	 * Clear the retry write flag if the buffer was written without
	 * error.  The presence of B_DELWRI means the buffer has not yet
	 * been written and the presence of B_ERROR means that an error
	 * is still occurring.
	 */
	if ((bp->b_flags & (B_ERROR | B_DELWRI | B_RETRYWRI)) == B_RETRYWRI) {
		bp->b_flags &= ~B_RETRYWRI;
	}

	/* Check for anomalous conditions */
	if (bp->b_flags & (B_ERROR|B_NOCACHE)) {
		if (bp->b_flags & B_NOCACHE) {
			/* Don't add to the freelist. Destroy it now */
			kmem_free(bp->b_un.b_addr, bp->b_bufsize);
			sema_destroy(&bp->b_sem);
			sema_destroy(&bp->b_io);
			kmem_free(bp, sizeof (struct buf));
			return;
		}
		/*
		 * If a write failed and we are supposed to retry write,
		 * don't toss the buffer.  Keep it around and mark it
		 * delayed write in the hopes that it will eventually
		 * get flushed (and still keep the system running.)
		 */
		if ((bp->b_flags & (B_READ | B_RETRYWRI)) == B_RETRYWRI) {
			bp->b_flags |= B_DELWRI;
			/* keep fsflush from trying continuously to flush */
			bp->b_start = ddi_get_lbolt();
		} else
			bp->b_flags |= B_AGE|B_STALE;
		bp->b_flags &= ~B_ERROR;
		bp->b_error = 0;
	}

	/*
	 * If delayed write is set then put in on the delayed
	 * write list instead of the free buffer list.
	 */
	index = bio_bhash(bp->b_edev, bp->b_blkno);
	hmp = &hbuf[index].b_lock;

	mutex_enter(hmp);
	hp = &hbuf[index];
	dp = (struct buf *)hp;

	/*
	 * Make sure that the number of entries on this list are
	 * Zero <= count <= total # buffers
	 */
	ASSERT(hp->b_length >= 0);
	ASSERT(hp->b_length < nbuf);

	hp->b_length++;		/* We are adding this buffer */

	if (bp->b_flags & B_DELWRI) {
		/*
		 * This buffer goes on the delayed write buffer list
		 */
		dp = (struct buf *)&dwbuf[index];
	}
	ASSERT(bp->b_bufsize > 0);
	ASSERT(bp->b_bcount > 0);
	ASSERT(bp->b_un.b_addr != NULL);

	if (bp->b_flags & B_AGE) {
		/* B_AGE: splice in at the head so the buffer is reused first */
		backp = &dp->av_forw;
		(*backp)->av_back = bp;
		bp->av_forw = *backp;
		*backp = bp;
		bp->av_back = dp;
	} else {
		/* otherwise splice in at the tail (LRU order) */
		backp = &dp->av_back;
		(*backp)->av_forw = bp;
		bp->av_back = *backp;
		*backp = bp;
		bp->av_forw = dp;
	}
	mutex_exit(hmp);

	if (bfreelist.b_flags & B_WANTED) {
		/*
		 * Should come here very very rarely.
		 */
		mutex_enter(&bfree_lock);
		if (bfreelist.b_flags & B_WANTED) {
			bfreelist.b_flags &= ~B_WANTED;
			cv_broadcast(&bio_mem_cv);
		}
		mutex_exit(&bfree_lock);
	}

	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC);
	/*
	 * Don't let anyone get the buffer off the freelist before we
	 * release our hold on it.
	 */
	sema_v(&bp->b_sem);
}

/*
 * Return a count of the number of B_BUSY buffers in the system
 * Can only be used as a good estimate.  If 'cleanit' is set,
 * try to flush all bufs.
 */
int
bio_busy(int cleanit)
{
	struct buf *bp, *dp;
	int busy = 0;
	int i;
	kmutex_t *hmp;

	for (i = 0; i < v.v_hbuf; i++) {
		vfs_syncprogress();
		dp = (struct buf *)&hbuf[i];
		hmp = &hbuf[i].b_lock;

		mutex_enter(hmp);
		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
			if (bp->b_flags & B_BUSY)
				busy++;
		}
		mutex_exit(hmp);
	}

	if (cleanit && busy != 0) {
		bflush(NODEV);
	}

	return (busy);
}

/*
 * this interface is provided for binary compatibility.
 *
 * Assign a buffer for the given block.  If the appropriate
 * block is already associated, return it; otherwise search
 * for the oldest non-busy buffer and reassign it.
 */
struct buf *
getblk(dev_t dev, daddr_t blkno, long bsize)
{
	return (getblk_common(/* ufsvfsp */ NULL, dev,
	    blkno, bsize, /* errflg */ 0));
}

/*
 * Assign a buffer for the given block.  If the appropriate
 * block is already associated, return it; otherwise search
 * for the oldest non-busy buffer and reassign it.
 *
 * Returns with the buffer's b_sem held.  With errflg set, a caller
 * running during panic gets an error buffer back instead of blocking
 * on a busy buffer (see errout below).
 */
struct buf *
getblk_common(void * arg, dev_t dev, daddr_t blkno, long bsize, int errflg)
{
	ufsvfs_t *ufsvfsp = (struct ufsvfs *)arg;
	struct buf *bp;
	struct buf *dp;
	struct buf *nbp = NULL;
	struct buf *errbp;
	uint_t		index;
	kmutex_t	*hmp;
	struct	hbuf	*hp;

	if (getmajor(dev) >= devcnt)
		cmn_err(CE_PANIC, "blkdev");

	biostats.bio_lookup.value.ui32++;

	index = bio_bhash(dev, blkno);
	hp = &hbuf[index];
	dp = (struct buf *)hp;
	hmp = &hp->b_lock;

	mutex_enter(hmp);
loop:
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_blkno != blkno || bp->b_edev != dev ||
		    (bp->b_flags & B_STALE))
			continue;
		/*
		 * Avoid holding the hash lock in the event that
		 * the buffer is locked by someone. Since the hash chain
		 * may change when we drop the hash lock
		 * we have to start at the beginning of the chain if the
		 * buffer identity/contents aren't valid.
		 */
		if (!sema_tryp(&bp->b_sem)) {
			biostats.bio_bufbusy.value.ui32++;
			mutex_exit(hmp);
			/*
			 * OK, we are dealing with a busy buffer.
			 * In the case that we are panicking and we
			 * got called from bread(), we have some chance
			 * for error recovery. So better bail out from
			 * here since sema_p() won't block. If we got
			 * called directly from ufs routines, there is
			 * no way to report an error yet.
			 */
			if (panicstr && errflg)
				goto errout;
			/*
			 * For the following line of code to work
			 * correctly never kmem_free the buffer "header".
			 */
			sema_p(&bp->b_sem);
			if (bp->b_blkno != blkno || bp->b_edev != dev ||
			    (bp->b_flags & B_STALE)) {
				sema_v(&bp->b_sem);
				mutex_enter(hmp);
				goto loop;	/* start over */
			}
			mutex_enter(hmp);
		}
		/* Found */
		biostats.bio_hit.value.ui32++;
		bp->b_flags &= ~B_AGE;

		/*
		 * Yank it off the free/delayed write lists
		 */
		hp->b_length--;
		notavail(bp);
		mutex_exit(hmp);

		/* NOTE(review): NULL used where 0 was presumably intended */
		/* (int-vs-pointer comparison); harmless but lint-unclean. */
		ASSERT((bp->b_flags & B_NOCACHE) == NULL);

		if (nbp == NULL) {
			/*
			 * Make the common path short.
			 */
			ASSERT(SEMA_HELD(&bp->b_sem));
			return (bp);
		}

		biostats.bio_bufdup.value.ui32++;

		/*
		 * The buffer must have entered during the lock upgrade
		 * so free the new buffer we allocated and return the
		 * found buffer.
		 */
		kmem_free(nbp->b_un.b_addr, nbp->b_bufsize);
		nbp->b_un.b_addr = NULL;

		/*
		 * Account for the memory
		 */
		mutex_enter(&bfree_lock);
		bfreelist.b_bufsize += nbp->b_bufsize;
		mutex_exit(&bfree_lock);

		/*
		 * Destroy buf identity, and place on avail list
		 */
		nbp->b_dev = (o_dev_t)NODEV;
		nbp->b_edev = NODEV;
		nbp->b_flags = 0;
		nbp->b_file = NULL;
		nbp->b_offset = -1;

		sema_v(&nbp->b_sem);
		bio_bhdr_free(nbp);

		ASSERT(SEMA_HELD(&bp->b_sem));
		return (bp);
	}

	/*
	 * bio_getfreeblk may block so check the hash chain again.
	 */
	if (nbp == NULL) {
		mutex_exit(hmp);
		nbp = bio_getfreeblk(bsize);
		mutex_enter(hmp);
		goto loop;
	}

	/*
	 * New buffer. Assign nbp and stick it on the hash.
	 */
	nbp->b_flags = B_BUSY;
	nbp->b_edev = dev;
	nbp->b_dev = (o_dev_t)cmpdev(dev);
	nbp->b_blkno = blkno;
	nbp->b_iodone = NULL;
	nbp->b_bcount = bsize;
	/*
	 * If we are given a ufsvfsp and the vfs_root field is NULL
	 * then this must be I/O for a superblock.  A superblock's
	 * buffer is set up in mountfs() and there is no root vnode
	 * at that point.
	 */
	if (ufsvfsp && ufsvfsp->vfs_root) {
		nbp->b_vp = ufsvfsp->vfs_root;
	} else {
		nbp->b_vp = NULL;
	}

	ASSERT((nbp->b_flags & B_NOCACHE) == NULL);

	binshash(nbp, dp);
	mutex_exit(hmp);

	ASSERT(SEMA_HELD(&nbp->b_sem));

	return (nbp);


	/*
	 * Come here in case of an internal error.  At this point we couldn't
	 * get a buffer, but he have to return one.  Hence we allocate some
	 * kind of error reply buffer on the fly.  This buffer is marked as
	 * B_NOCACHE | B_AGE | B_ERROR | B_DONE to assure the following:
	 *	- B_ERROR will indicate error to the caller.
	 *	- B_DONE will prevent us from reading the buffer from
	 *	  the device.
	 *	- B_NOCACHE will cause that this buffer gets free'd in
	 *	  brelse().
	 */

errout:
	errbp = geteblk();
	sema_p(&errbp->b_sem);
	errbp->b_flags &= ~B_BUSY;
	errbp->b_flags |= (B_ERROR | B_DONE);
	return (errbp);
}

/*
 * Get an empty block, not assigned to any particular device.
 * Returns a locked buffer that is not on any hash or free list.
 */
struct buf *
ngeteblk(long bsize)
{
	struct buf *bp;

	bp = kmem_alloc(sizeof (struct buf), KM_SLEEP);
	bioinit(bp);
	bp->av_forw = bp->av_back = NULL;
	bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);
	bp->b_bufsize = bsize;
	/* B_NOCACHE makes brelse() destroy this buffer instead of caching */
	bp->b_flags = B_BUSY | B_NOCACHE | B_AGE;
	bp->b_dev = (o_dev_t)NODEV;
	bp->b_edev = NODEV;
	bp->b_lblkno = 0;
	bp->b_bcount = bsize;
	bp->b_iodone = NULL;
	return (bp);
}

/*
 * Interface of geteblk() is kept intact to maintain driver compatibility.
 * Use ngeteblk() to allocate block size other than 1 KB.
 */
struct buf *
geteblk(void)
{
	return (ngeteblk((long)1024));
}

/*
 * Return a buffer w/o sleeping
 */
struct buf *
trygetblk(dev_t dev, daddr_t blkno)
{
	struct buf	*bp;
	struct buf	*dp;
	struct hbuf	*hp;
	kmutex_t	*hmp;
	uint_t		index;

	index = bio_bhash(dev, blkno);
	hp = &hbuf[index];
	hmp = &hp->b_lock;

	/* All lock acquisitions here are try-only; fail rather than sleep. */
	if (!mutex_tryenter(hmp))
		return (NULL);

	dp = (struct buf *)hp;
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_blkno != blkno || bp->b_edev != dev ||
		    (bp->b_flags & B_STALE))
			continue;
		/*
		 * Get access to a valid buffer without sleeping
		 */
		if (sema_tryp(&bp->b_sem)) {
			if (bp->b_flags & B_DONE) {
				hp->b_length--;
				notavail(bp);
				mutex_exit(hmp);
				return (bp);
			} else {
				sema_v(&bp->b_sem);
				break;
			}
		}
		break;
	}
	mutex_exit(hmp);
	return (NULL);
}

/*
 * Wait for I/O completion on the buffer; return errors
 * to the user.
 */
int
iowait(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));
	return (biowait(bp));
}

/*
 * Mark I/O complete on a buffer, release it if I/O is asynchronous,
 * and wake up anyone waiting for it.
 */
void
iodone(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));
	(void) biodone(bp);
}

/*
 * Zero the core associated with a buffer.
 */
void
clrbuf(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));
	bzero(bp->b_un.b_addr, bp->b_bcount);
	bp->b_resid = 0;
}


/*
 * Make sure all write-behind blocks on dev (or NODEV for all)
 * are flushed out.
 */
void
bflush(dev_t dev)
{
	struct buf *bp, *dp;
	struct hbuf *hp;
	struct buf *delwri_list = EMPTY_LIST;
	int i, index;
	kmutex_t *hmp;

	mutex_enter(&blist_lock);
	/*
	 * Wait for any invalidates or flushes ahead of us to finish.
	 * We really could split blist_lock up per device for better
	 * parallelism here.
	 */
	while (bio_doinginval || bio_doingflush) {
		bio_flinv_cv_wanted = 1;
		cv_wait(&bio_flushinval_cv, &blist_lock);
	}
	bio_doingflush++;
	/*
	 * Gather all B_DELWRI buffer for device.
	 * Lock ordering is b_sem > hash lock (brelse).
	 * Since we are finding the buffer via the delayed write list,
	 * it may be busy and we would block trying to get the
	 * b_sem lock while holding hash lock. So transfer all the
	 * candidates on the delwri_list and then drop the hash locks.
	 */
	for (i = 0; i < v.v_hbuf; i++) {
		vfs_syncprogress();
		hmp = &hbuf[i].b_lock;
		dp = (struct buf *)&dwbuf[i];
		mutex_enter(hmp);
		for (bp = dp->av_forw; bp != dp; bp = bp->av_forw) {
			if (dev == NODEV || bp->b_edev == dev) {
				if (bp->b_list == NULL) {
					bp->b_list = delwri_list;
					delwri_list = bp;
				}
			}
		}
		mutex_exit(hmp);
	}
	mutex_exit(&blist_lock);

	/*
	 * Now that the hash locks have been dropped grab the semaphores
	 * and write back all the buffers that have B_DELWRI set.
	 */
	while (delwri_list != EMPTY_LIST) {
		vfs_syncprogress();
		bp = delwri_list;

		sema_p(&bp->b_sem);	/* may block */
		if ((dev != bp->b_edev && dev != NODEV) ||
		    (panicstr && bp->b_flags & B_BUSY)) {
			sema_v(&bp->b_sem);
			delwri_list = bp->b_list;
			bp->b_list = NULL;
			continue;	/* No longer a candidate */
		}
		if (bp->b_flags & B_DELWRI) {
			index = bio_bhash(bp->b_edev, bp->b_blkno);
			hp = &hbuf[index];
			hmp = &hp->b_lock;
			dp = (struct buf *)hp;

			bp->b_flags |= B_ASYNC;
			mutex_enter(hmp);
			hp->b_length--;
			notavail(bp);
			mutex_exit(hmp);
			if (bp->b_vp == NULL) {		/* !ufs */
				BWRITE(bp);
			} else {			/* ufs */
				UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
			}
		} else {
			/* B_DELWRI cleared while we waited; nothing to do */
			sema_v(&bp->b_sem);
		}
		delwri_list = bp->b_list;
		bp->b_list = NULL;
	}
	mutex_enter(&blist_lock);
	bio_doingflush--;
	if (bio_flinv_cv_wanted) {
		bio_flinv_cv_wanted = 0;
		cv_broadcast(&bio_flushinval_cv);
	}
	mutex_exit(&blist_lock);
}

/*
 * Ensure that a specified block is up-to-date on disk.
 */
void
blkflush(dev_t dev, daddr_t blkno)
{
	struct buf *bp, *dp;
	struct hbuf *hp;
	struct buf *sbp = NULL;
	uint_t index;
	kmutex_t *hmp;

	index = bio_bhash(dev, blkno);
	hp = &hbuf[index];
	dp = (struct buf *)hp;
	hmp = &hp->b_lock;

	/*
	 * Identify the buffer in the cache belonging to
	 * this device and blkno (if any).
	 */
	mutex_enter(hmp);
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_blkno != blkno || bp->b_edev != dev ||
		    (bp->b_flags & B_STALE))
			continue;
		sbp = bp;
		break;
	}
	mutex_exit(hmp);
	if (sbp == NULL)
		return;
	/*
	 * Now check the buffer we have identified and
	 * make sure it still belongs to the device and is B_DELWRI
	 * (identity may have changed while we dropped the hash lock).
	 */
	sema_p(&sbp->b_sem);
	if (sbp->b_blkno == blkno && sbp->b_edev == dev &&
	    (sbp->b_flags & (B_DELWRI|B_STALE)) == B_DELWRI) {
		mutex_enter(hmp);
		hp->b_length--;
		notavail(sbp);
		mutex_exit(hmp);
		/*
		 * XXX - There is nothing to guarantee a synchronous
		 * write here if the B_ASYNC flag is set.  This needs
		 * some investigation.
		 */
		if (sbp->b_vp == NULL) {		/* !ufs */
			BWRITE(sbp);	/* synchronous write */
		} else {				/* ufs */
			UFS_BWRITE(VTOI(sbp->b_vp)->i_ufsvfs, sbp);
		}
	} else {
		sema_v(&sbp->b_sem);
	}
}

/*
 * Same as binval, except can force-invalidate delayed-write buffers
 * (which are not be already flushed because of device errors).  Also
 * makes sure that the retry write flag is cleared.
 */
int
bfinval(dev_t dev, int force)
{
	struct buf *dp;
	struct buf *bp;
	struct buf *binval_list = EMPTY_LIST;
	int i, error = 0;
	kmutex_t *hmp;
	uint_t index;
	struct buf **backp;

	mutex_enter(&blist_lock);
	/*
	 * Wait for any flushes ahead of us to finish, it's ok to
	 * do invalidates in parallel.
	 */
	while (bio_doingflush) {
		bio_flinv_cv_wanted = 1;
		cv_wait(&bio_flushinval_cv, &blist_lock);
	}
	bio_doinginval++;

	/* Gather bp's */
	for (i = 0; i < v.v_hbuf; i++) {
		dp = (struct buf *)&hbuf[i];
		hmp = &hbuf[i].b_lock;

		mutex_enter(hmp);
		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
			if (bp->b_edev == dev) {
				if (bp->b_list == NULL) {
					bp->b_list = binval_list;
					binval_list = bp;
				}
			}
		}
		mutex_exit(hmp);
	}
	mutex_exit(&blist_lock);

	/* Invalidate all bp's found */
	while (binval_list != EMPTY_LIST) {
		bp = binval_list;

		sema_p(&bp->b_sem);
		if (bp->b_edev == dev) {
			if (force && (bp->b_flags & B_DELWRI)) {
				/* clear B_DELWRI, move to non-dw freelist */
				index = bio_bhash(bp->b_edev, bp->b_blkno);
				hmp = &hbuf[index].b_lock;
				dp = (struct buf *)&hbuf[index];
				mutex_enter(hmp);

				/* remove from delayed write freelist */
				notavail(bp);

				/* add to B_AGE side of non-dw freelist */
				backp = &dp->av_forw;
				(*backp)->av_back = bp;
				bp->av_forw = *backp;
				*backp = bp;
				bp->av_back = dp;

				/*
				 * make sure write retries and busy are cleared
				 */
				bp->b_flags &=
				    ~(B_BUSY | B_DELWRI | B_RETRYWRI);
				mutex_exit(hmp);
			}
			/* without force, a B_DELWRI buffer is an error */
			if ((bp->b_flags & B_DELWRI) == 0)
				bp->b_flags |= B_STALE|B_AGE;
			else
				error = EIO;
		}
		sema_v(&bp->b_sem);
		binval_list = bp->b_list;
		bp->b_list = NULL;
	}
	mutex_enter(&blist_lock);
	bio_doinginval--;
	if (bio_flinv_cv_wanted) {
		cv_broadcast(&bio_flushinval_cv);
		bio_flinv_cv_wanted = 0;
	}
	mutex_exit(&blist_lock);
	return (error);
}

/*
 * If possible, invalidate blocks for a dev on demand
 */
void
binval(dev_t dev)
{
	(void) bfinval(dev, 0);
}

/*
 * Initialize the buffer I/O system by
 * freeing
 * all buffers and setting all device hash buffer lists to empty.
 */
void
binit(void)
{
	struct buf *bp;
	unsigned int i, pct;
	ulong_t	bio_max_hwm, bio_default_hwm;

	/*
	 * Maximum/Default values for bufhwm are set to the smallest of:
	 *	- BIO_MAX_PERCENT resp. BIO_BUF_PERCENT of real memory
	 *	- 1/4 of kernel virtual memory
	 *	- INT32_MAX to prevent overflows of v.v_bufhwm (which is int).
	 * Additionally, in order to allow simple tuning by percentage of
	 * physical memory, bufhwm_pct is used to calculate the default if
	 * the value of this tunable is between 0 and BIO_MAX_PERCENT.
	 *
	 * Since the unit for v.v_bufhwm is kilobytes, this allows for
	 * a maximum of 1024 * 2GB == 2TB memory usage by buffer headers.
	 */
	bio_max_hwm = MIN(physmem / BIO_MAX_PERCENT,
	    btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
	bio_max_hwm = MIN(INT32_MAX, bio_max_hwm);

	/*
	 * Percentages are carried as divisors (100 / pct, see
	 * BIO_BUF_PERCENT), so a smaller value here means a larger
	 * percentage; pct < BIO_MAX_PERCENT means bufhwm_pct > 20%.
	 */
	pct = BIO_BUF_PERCENT;
	if (bufhwm_pct != 0 &&
	    ((pct = 100 / bufhwm_pct) < BIO_MAX_PERCENT)) {
		pct = BIO_BUF_PERCENT;
		/*
		 * Invalid user specified value, emit a warning.
		 */
		cmn_err(CE_WARN, "binit: bufhwm_pct(%d) out of \
range(1..%d). Using %d as default.",
		    bufhwm_pct,
		    100 / BIO_MAX_PERCENT, 100 / BIO_BUF_PERCENT);
	}

	bio_default_hwm = MIN(physmem / pct,
	    btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
	bio_default_hwm = MIN(INT32_MAX, bio_default_hwm);

	if ((v.v_bufhwm = bufhwm) == 0)
		v.v_bufhwm = bio_default_hwm;

	if (v.v_bufhwm < BIO_MIN_HWM || v.v_bufhwm > bio_max_hwm) {
		v.v_bufhwm = (int)bio_max_hwm;
		/*
		 * Invalid user specified value, emit a warning.
		 */
		cmn_err(CE_WARN,
		    "binit: bufhwm(%d) out \
of range(%d..%lu). Using %lu as default",
		    bufhwm,
		    BIO_MIN_HWM, bio_max_hwm, bio_max_hwm);
	}

	/*
	 * Determine the number of hash buckets. Default is to
	 * create ~BIO_HASHLEN entries per chain based on MAXBSIZE buffers.
	 * Round up number to the next power of 2.
	 */
	v.v_hbuf = 1 << highbit((((ulong_t)v.v_bufhwm * 1024) / MAXBSIZE) /
	    BIO_HASHLEN);
	v.v_hmask = v.v_hbuf - 1;
	v.v_buf = BIO_BHDR_POOL;

	hbuf = kmem_zalloc(v.v_hbuf * sizeof (struct hbuf), KM_SLEEP);

	dwbuf = kmem_zalloc(v.v_hbuf * sizeof (struct dwbuf), KM_SLEEP);

	bfreelist.b_bufsize = (size_t)v.v_bufhwm * 1024;
	bp = &bfreelist;
	/* bfreelist starts as an empty circular list pointing at itself */
	bp->b_forw = bp->b_back = bp->av_forw = bp->av_back = bp;

	for (i = 0; i < v.v_hbuf; i++) {
		hbuf[i].b_forw = hbuf[i].b_back = (struct buf *)&hbuf[i];
		hbuf[i].av_forw = hbuf[i].av_back = (struct buf *)&hbuf[i];

		/*
		 * Initialize the delayed write buffer list.
		 */
		dwbuf[i].b_forw = dwbuf[i].b_back = (struct buf *)&dwbuf[i];
		dwbuf[i].av_forw = dwbuf[i].av_back = (struct buf *)&dwbuf[i];
	}
}

/*
 * Wait for I/O completion on the buffer; return error code.
 * If bp was for synchronous I/O, bp is invalid and associated
 * resources are freed on return.
 */
int
biowait(struct buf *bp)
{
	int error = 0;
	struct cpu *cpup;

	/* caller must already own the buffer */
	ASSERT(SEMA_HELD(&bp->b_sem));

	/* charge the wait to this CPU's iowait statistic */
	cpup = CPU;
	atomic_inc_64(&cpup->cpu_stats.sys.iowait);
	DTRACE_IO1(wait__start, struct buf *, bp);

	/*
	 * In case of panic, busy wait for completion
	 */
	if (panicstr) {
		while ((bp->b_flags & B_DONE) == 0)
			drv_usecwait(10);
	} else
		sema_p(&bp->b_io);

	DTRACE_IO1(wait__done, struct buf *, bp);
	atomic_dec_64(&cpup->cpu_stats.sys.iowait);

	error = geterror(bp);
	if ((bp->b_flags & B_ASYNC) == 0) {
		/* synchronous I/O: tear down any bp_mapin() mapping */
		if (bp->b_flags & B_REMAPPED)
			bp_mapout(bp);
	}
	return (error);
}

/*
 * Emit the TNF completion probe for biodone().  Kept in a separate
 * function (rather than inline) so the compiler can apply the tail
 * call optimization; see the comment at the call site.
 */
static void
biodone_tnf_probe(struct buf *bp)
{
	/* Kernel probe */
	TNF_PROBE_3(biodone, "io blockio", /* CSTYLED */,
	    tnf_device, device, bp->b_edev,
	    tnf_diskaddr, block, bp->b_lblkno,
	    tnf_opaque, buf, bp);
}

/*
 * Mark I/O complete on a buffer, release it if I/O is asynchronous,
 * and wake up anyone waiting for it.
 */
void
biodone(struct buf *bp)
{
	if (bp->b_flags & B_STARTED) {
		DTRACE_IO1(done, struct buf *, bp);
		bp->b_flags &= ~B_STARTED;
	}

	/*
	 * Call the TNF probe here instead of the inline code
	 * to force our compiler to use the tail call optimization.
	 */
	biodone_tnf_probe(bp);

	/*
	 * A registered b_iodone callback assumes full responsibility
	 * for the buffer; note that B_DONE is not set on this path.
	 */
	if (bp->b_iodone != NULL) {
		(*(bp->b_iodone))(bp);
		return;
	}
	ASSERT((bp->b_flags & B_DONE) == 0);
	ASSERT(SEMA_HELD(&bp->b_sem));
	bp->b_flags |= B_DONE;
	if (bp->b_flags & B_ASYNC) {
		/* nobody is waiting in biowait(): release the buffer here */
		if (bp->b_flags & (B_PAGEIO|B_REMAPPED))
			bio_pageio_done(bp);
		else
			brelse(bp);	/* release bp to freelist */
	} else {
		/* wake up the thread blocked in biowait() */
		sema_v(&bp->b_io);
	}
}

/*
 * Pick up the device's error number and pass it to the user;
 * if there is an error but the number is 0 set a generalized code.
 */
int
geterror(struct buf *bp)
{
	int error = 0;

	ASSERT(SEMA_HELD(&bp->b_sem));
	if (bp->b_flags & B_ERROR) {
		error = bp->b_error;
		/* B_ERROR set but no specific errno: report generic EIO */
		if (!error)
			error = EIO;
	}
	return (error);
}

/*
 * Support for pageio buffers.
 *
 * This stuff should be generalized to provide a generalized bp
 * header facility that can be used for things other than pageio.
 */

/*
 * Allocate and initialize a buf struct for use with pageio.
 * The returned buffer is marked B_PAGEIO|B_NOCACHE|B_BUSY (plus the
 * caller's flags), holds a reference on vp, and has b_sem held.
 */
struct buf *
pageio_setup(struct page *pp, size_t len, struct vnode *vp, int flags)
{
	struct buf *bp;
	struct cpu *cpup;

	if (flags & B_READ) {
		CPU_STATS_ENTER_K();
		cpup = CPU;	/* get pointer AFTER preemption is disabled */
		CPU_STATS_ADDQ(cpup, vm, pgin, 1);
		CPU_STATS_ADDQ(cpup, vm, pgpgin, btopr(len));
		if ((flags & B_ASYNC) == 0) {
			klwp_t *lwp = ttolwp(curthread);
			if (lwp != NULL)
				lwp->lwp_ru.majflt++;
			CPU_STATS_ADDQ(cpup, vm, maj_fault, 1);
			/*
			 * Kernel probe
			 * NOTE(review): pp is dereferenced here, but the
			 * pp != NULL check only happens below — assumes
			 * synchronous B_READ callers always pass a valid
			 * page list; confirm against callers.
			 */
			TNF_PROBE_2(major_fault, "vm pagefault", /* CSTYLED */,
			    tnf_opaque, vnode, pp->p_vnode,
			    tnf_offset, offset, pp->p_offset);
		}
		/*
		 * Update statistics for pages being paged in
		 */
		if (pp != NULL && pp->p_vnode != NULL) {
			if (IS_SWAPFSVP(pp->p_vnode)) {
				CPU_STATS_ADDQ(cpup, vm, anonpgin, btopr(len));
			} else {
				if (pp->p_vnode->v_flag & VVMEXEC) {
					CPU_STATS_ADDQ(cpup, vm, execpgin,
					    btopr(len));
				} else {
					CPU_STATS_ADDQ(cpup, vm, fspgin,
					    btopr(len));
				}
			}
		}
		CPU_STATS_EXIT_K();
		TRACE_1(TR_FAC_VM, TR_PAGE_WS_IN,
		    "page_ws_in:pp %p", pp);
		/* Kernel probe */
		TNF_PROBE_3(pagein, "vm pageio io", /* CSTYLED */,
		    tnf_opaque, vnode, pp->p_vnode,
		    tnf_offset, offset, pp->p_offset,
		    tnf_size, size, len);
	}

	bp = kmem_zalloc(sizeof (struct buf), KM_SLEEP);
	bp->b_bcount = len;
	bp->b_bufsize = len;
	bp->b_pages = pp;
	bp->b_flags = B_PAGEIO | B_NOCACHE | B_BUSY | flags;
	bp->b_offset = -1;
	sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);

	/* Initialize bp->b_sem in "locked" state */
	sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);

	VN_HOLD(vp);
	bp->b_vp = vp;
	THREAD_KPRI_RELEASE_N(btopr(len)); /* release kpri from page_locks */

	/*
	 * Caller sets dev & blkno and can adjust
	 * b_addr for page offset and can use bp_mapin
	 * to make pages kernel addressable.
	 */
	return (bp);
}

/*
 * Release a buffer allocated by pageio_setup(): drop the vnode hold,
 * undo any bp_mapin() mapping, and free the header.
 */
void
pageio_done(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));
	if (bp->b_flags & B_REMAPPED)
		bp_mapout(bp);
	VN_RELE(bp->b_vp);
	bp->b_vp = NULL;
	ASSERT((bp->b_flags & B_NOCACHE) != 0);

	/* A sema_v(bp->b_sem) is implied if we are destroying it */
	sema_destroy(&bp->b_sem);
	sema_destroy(&bp->b_io);
	kmem_free(bp, sizeof (struct buf));
}

/*
 * Check to see whether the buffers, except the one pointed by sbp,
 * associated with the device are busy.
 * NOTE: This expensive operation shall be improved together with ufs_icheck().
 */
int
bcheck(dev_t dev, struct buf *sbp)
{
	struct buf *bp;
	struct buf *dp;
	int i;
	kmutex_t *hmp;

	/*
	 * check for busy bufs for this filesystem
	 * (walks every hash chain under its bucket lock)
	 */
	for (i = 0; i < v.v_hbuf; i++) {
		dp = (struct buf *)&hbuf[i];
		hmp = &hbuf[i].b_lock;

		mutex_enter(hmp);
		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
			/*
			 * if buf is busy or dirty, then filesystem is busy
			 */
			if ((bp->b_edev == dev) &&
			    ((bp->b_flags & B_STALE) == 0) &&
			    (bp->b_flags & (B_DELWRI|B_BUSY)) &&
			    (bp != sbp)) {
				mutex_exit(hmp);
				return (1);
			}
		}
		mutex_exit(hmp);
	}
	return (0);
}

/*
 * Hash two 32 bit entities.
 */
int
hash2ints(int x, int y)
{
	int hash = 0;

	/*
	 * Simple multiplicative hash folding in each byte of x then y.
	 * NOTE(review): on negative inputs the right shifts and the
	 * signed-overflowing multiplies are implementation-defined /
	 * undefined by ISO C; this relies on the historical two's
	 * complement behavior of the supported compilers — confirm.
	 */
	hash = x - 1;
	hash = ((hash * 7) + (x >> 8)) - 1;
	hash = ((hash * 7) + (x >> 16)) - 1;
	hash = ((hash * 7) + (x >> 24)) - 1;
	hash = ((hash * 7) + y) - 1;
	hash = ((hash * 7) + (y >> 8)) - 1;
	hash = ((hash * 7) + (y >> 16)) - 1;
	hash = ((hash * 7) + (y >> 24)) - 1;

	return (hash);
}


/*
 * Return a new buffer struct.
 * Create a new buffer if we haven't gone over our high water
 * mark for memory, otherwise try to get one off the freelist.
 *
 * Returns a locked buf that has no id and is not on any hash or free
 * list.
 */
static struct buf *
bio_getfreeblk(long bsize)
{
	struct buf *bp, *dp;
	struct hbuf *hp;
	kmutex_t *hmp;
	uint_t start, end;

	/*
	 * mutex_enter(&bfree_lock);
	 * bfreelist.b_bufsize represents the amount of memory
	 * mutex_exit(&bfree_lock); protect ref to bfreelist
	 * we are allowed to allocate in the cache before we hit our hwm.
	 */
	bio_mem_get(bsize);	/* Account for our memory request */

/* NOTE(review): this label is not referenced by any goto in this function */
again:
	bp = bio_bhdr_alloc();	/* Get a buf hdr */
	sema_p(&bp->b_sem);	/* Should never fail */

	ASSERT(bp->b_un.b_addr == NULL);
	bp->b_un.b_addr = kmem_alloc(bsize, KM_NOSLEEP);
	if (bp->b_un.b_addr != NULL) {
		/*
		 * Make the common path short
		 */
		bp->b_bufsize = bsize;
		ASSERT(SEMA_HELD(&bp->b_sem));
		return (bp);
	} else {
		struct buf *save;

		save = bp;	/* Save bp we allocated */
		start = end = lastindex;

		biostats.bio_bufwant.value.ui32++;

		/*
		 * Memory isn't available from the system now. Scan
		 * the hash buckets till enough space is found.
		 */
		do {
			hp = &hbuf[start];
			hmp = &hp->b_lock;
			dp = (struct buf *)hp;

			mutex_enter(hmp);
			bp = dp->av_forw;

			while (bp != dp) {

				ASSERT(bp != NULL);

				/* skip buffers someone else holds */
				if (!sema_tryp(&bp->b_sem)) {
					bp = bp->av_forw;
					continue;
				}

				/*
				 * Since we are going down the freelist
				 * associated with this hash bucket the
				 * B_DELWRI flag should not be set.
				 */
				ASSERT(!(bp->b_flags & B_DELWRI));

				/* only steal a buffer of exactly our size */
				if (bp->b_bufsize == bsize) {
					hp->b_length--;
					notavail(bp);
					bremhash(bp);
					mutex_exit(hmp);

					/*
					 * Didn't kmem_alloc any more, so don't
					 * count it twice.
					 */
					mutex_enter(&bfree_lock);
					bfreelist.b_bufsize += bsize;
					mutex_exit(&bfree_lock);

					/*
					 * Update the lastindex value.
					 */
					lastindex = start;

					/*
					 * Put our saved bp back on the list
					 */
					sema_v(&save->b_sem);
					bio_bhdr_free(save);
					ASSERT(SEMA_HELD(&bp->b_sem));
					return (bp);
				}
				sema_v(&bp->b_sem);
				bp = bp->av_forw;
			}
			mutex_exit(hmp);
			start = ((start + 1) % v.v_hbuf);
		} while (start != end);

		/* nothing reusable found: block until memory is available */
		biostats.bio_bufwait.value.ui32++;
		bp = save;	/* Use original bp */
		bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);
	}

	bp->b_bufsize = bsize;
	ASSERT(SEMA_HELD(&bp->b_sem));
	return (bp);
}

/*
 * Allocate a buffer header. If none currently available, allocate
 * a new pool.
 */
static struct buf *
bio_bhdr_alloc(void)
{
	struct buf *dp, *sdp;
	struct buf *bp;
	int i;

	/* loop until a header can be taken from bhdrlist */
	for (;;) {
		mutex_enter(&bhdr_lock);
		if (bhdrlist != NULL) {
			bp = bhdrlist;
			bhdrlist = bp->av_forw;
			mutex_exit(&bhdr_lock);
			bp->av_forw = NULL;
			return (bp);
		}
		mutex_exit(&bhdr_lock);

		/*
		 * Need to allocate a new pool. If the system is currently
		 * out of memory, then try freeing things on the freelist.
		 */
		dp = kmem_zalloc(sizeof (struct buf) * v.v_buf, KM_NOSLEEP);
		if (dp == NULL) {
			/*
			 * System can't give us a pool of headers, try
			 * recycling from the free lists.
			 */
			bio_recycle(BIO_HEADER, 0);
		} else {
			sdp = dp;
			for (i = 0; i < v.v_buf; i++, dp++) {
				/*
				 * The next two lines are needed since NODEV
				 * is -1 and not NULL
				 */
				dp->b_dev = (o_dev_t)NODEV;
				dp->b_edev = NODEV;
				dp->av_forw = dp + 1;
				sema_init(&dp->b_sem, 1, NULL, SEMA_DEFAULT,
				    NULL);
				sema_init(&dp->b_io, 0, NULL, SEMA_DEFAULT,
				    NULL);
				dp->b_offset = -1;
			}
			/* splice the new pool onto the front of bhdrlist */
			mutex_enter(&bhdr_lock);
			(--dp)->av_forw = bhdrlist;	/* Fix last pointer */
			bhdrlist = sdp;
			nbuf += v.v_buf;
			bp = bhdrlist;
			bhdrlist = bp->av_forw;
			mutex_exit(&bhdr_lock);

			bp->av_forw = NULL;
			return (bp);
		}
	}
}

/*
 * Return a fully-disconnected, cleaned-out buffer header to the
 * bhdrlist free pool.  The ASSERTs document the required state.
 */
static void
bio_bhdr_free(struct buf *bp)
{
	ASSERT(bp->b_back == NULL);
	ASSERT(bp->b_forw == NULL);
	ASSERT(bp->av_back == NULL);
	ASSERT(bp->av_forw == NULL);
	ASSERT(bp->b_un.b_addr == NULL);
	ASSERT(bp->b_dev == (o_dev_t)NODEV);
	ASSERT(bp->b_edev == NODEV);
	ASSERT(bp->b_flags == 0);

	mutex_enter(&bhdr_lock);
	bp->av_forw = bhdrlist;
	bhdrlist = bp;
	mutex_exit(&bhdr_lock);
}

/*
 * If we haven't gone over the high water mark, it's o.k. to
 * allocate more buffer space, otherwise recycle buffers
 * from the freelist until enough memory is free for a bsize request.
 *
 * We account for this memory, even though
 * we don't allocate it here.
 */
static void
bio_mem_get(long bsize)
{
	/* fast path: still under the high water mark, just account for it */
	mutex_enter(&bfree_lock);
	if (bfreelist.b_bufsize > bsize) {
		bfreelist.b_bufsize -= bsize;
		mutex_exit(&bfree_lock);
		return;
	}
	mutex_exit(&bfree_lock);
	/* over the hwm: recycle cached buffers until bsize is available */
	bio_recycle(BIO_MEM, bsize);
}

/*
 * flush a list of delayed write buffers.
 * (currently used only by bio_recycle below.)
 */
static void
bio_flushlist(struct buf *delwri_list)
{
	struct buf *bp;

	while (delwri_list != EMPTY_LIST) {
		bp = delwri_list;
		/* asynchronous write; buffer is released by biodone() */
		bp->b_flags |= B_AGE | B_ASYNC;
		if (bp->b_vp == NULL) {		/* !ufs */
			BWRITE(bp);
		} else {			/* ufs */
			UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
		}
		delwri_list = bp->b_list;
		bp->b_list = NULL;
	}
}

/*
 * Start recycling buffers on the freelist for one of 2 reasons:
 * - we need a buffer header
 * - we need to free up memory
 * Once started we continue to recycle buffers until the B_AGE
 * buffers are gone.
 */
static void
bio_recycle(int want, long bsize)
{
	struct buf *bp, *dp, *dwp, *nbp;
	struct hbuf *hp;
	int found = 0;
	kmutex_t *hmp;
	int start, end;
	struct buf *delwri_list = EMPTY_LIST;

	/*
	 * Recycle buffers.
	 */
top:
	start = end = lastindex;
	do {
		hp = &hbuf[start];
		hmp = &hp->b_lock;
		dp = (struct buf *)hp;

		/* pass 1: reclaim clean buffers on this bucket's freelist */
		mutex_enter(hmp);
		bp = dp->av_forw;

		while (bp != dp) {

			ASSERT(bp != NULL);

			if (!sema_tryp(&bp->b_sem)) {
				bp = bp->av_forw;
				continue;
			}
			/*
			 * Do we really want to nuke all of the B_AGE stuff??
			 */
			if ((bp->b_flags & B_AGE) == 0 && found) {
				sema_v(&bp->b_sem);
				mutex_exit(hmp);
				lastindex = start;
				return;	/* All done */
			}

			ASSERT(MUTEX_HELD(&hp->b_lock));
			ASSERT(!(bp->b_flags & B_DELWRI));
			hp->b_length--;
			notavail(bp);

			/*
			 * Remove bhdr from cache, free up memory,
			 * and add the hdr to the freelist.
			 */
			bremhash(bp);
			mutex_exit(hmp);

			if (bp->b_bufsize) {
				kmem_free(bp->b_un.b_addr, bp->b_bufsize);
				bp->b_un.b_addr = NULL;
				mutex_enter(&bfree_lock);
				bfreelist.b_bufsize += bp->b_bufsize;
				mutex_exit(&bfree_lock);
			}

			/* reset to the state bio_bhdr_free() asserts */
			bp->b_dev = (o_dev_t)NODEV;
			bp->b_edev = NODEV;
			bp->b_flags = 0;
			sema_v(&bp->b_sem);
			bio_bhdr_free(bp);
			if (want == BIO_HEADER) {
				found = 1;
			} else {
				ASSERT(want == BIO_MEM);
				if (!found && bfreelist.b_bufsize >= bsize) {
					/* Account for the memory we want */
					mutex_enter(&bfree_lock);
					if (bfreelist.b_bufsize >= bsize) {
						bfreelist.b_bufsize -= bsize;
						found = 1;
					}
					mutex_exit(&bfree_lock);
				}
			}

			/*
			 * Since we dropped hmp start from the
			 * begining.
			 */
			mutex_enter(hmp);
			bp = dp->av_forw;
		}
		mutex_exit(hmp);

		/*
		 * Look at the delayed write list.
		 * First gather into a private list, then write them.
		 */
		dwp = (struct buf *)&dwbuf[start];
		mutex_enter(&blist_lock);
		bio_doingflush++;
		mutex_enter(hmp);
		for (bp = dwp->av_forw; bp != dwp; bp = nbp) {

			ASSERT(bp != NULL);
			nbp = bp->av_forw;

			if (!sema_tryp(&bp->b_sem))
				continue;
			ASSERT(bp->b_flags & B_DELWRI);
			/*
			 * Do we really want to nuke all of the B_AGE stuff??
			 */

			if ((bp->b_flags & B_AGE) == 0 && found) {
				sema_v(&bp->b_sem);
				mutex_exit(hmp);
				lastindex = start;
				/* flush what we gathered before returning */
				mutex_exit(&blist_lock);
				bio_flushlist(delwri_list);
				mutex_enter(&blist_lock);
				bio_doingflush--;
				if (bio_flinv_cv_wanted) {
					bio_flinv_cv_wanted = 0;
					cv_broadcast(&bio_flushinval_cv);
				}
				mutex_exit(&blist_lock);
				return;	/* All done */
			}

			/*
			 * If the buffer is already on a flush or
			 * invalidate list then just skip it.
			 */
			if (bp->b_list != NULL) {
				sema_v(&bp->b_sem);
				continue;
			}
			/*
			 * We are still on the same bucket.
			 */
			hp->b_length--;
			notavail(bp);
			bp->b_list = delwri_list;
			delwri_list = bp;
		}
		mutex_exit(hmp);
		mutex_exit(&blist_lock);
		bio_flushlist(delwri_list);
		delwri_list = EMPTY_LIST;
		mutex_enter(&blist_lock);
		bio_doingflush--;
		if (bio_flinv_cv_wanted) {
			bio_flinv_cv_wanted = 0;
			cv_broadcast(&bio_flushinval_cv);
		}
		mutex_exit(&blist_lock);
		start = (start + 1) % v.v_hbuf;

	} while (start != end);

	if (found)
		return;

	/*
	 * Free lists exhausted and we haven't satisfied the request.
	 * Wait here for more entries to be added to freelist.
	 * Because this might have just happened, make it timed.
	 */
	mutex_enter(&bfree_lock);
	bfreelist.b_flags |= B_WANTED;
	(void) cv_reltimedwait(&bio_mem_cv, &bfree_lock, hz, TR_CLOCK_TICK);
	mutex_exit(&bfree_lock);
	goto top;
}

/*
 * See if the block is associated with some buffer
 * (mainly to avoid getting hung up on a wait in breada).
1880 */ 1881 static int 1882 bio_incore(dev_t dev, daddr_t blkno) 1883 { 1884 struct buf *bp; 1885 struct buf *dp; 1886 uint_t index; 1887 kmutex_t *hmp; 1888 1889 index = bio_bhash(dev, blkno); 1890 dp = (struct buf *)&hbuf[index]; 1891 hmp = &hbuf[index].b_lock; 1892 1893 mutex_enter(hmp); 1894 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) { 1895 if (bp->b_blkno == blkno && bp->b_edev == dev && 1896 (bp->b_flags & B_STALE) == 0) { 1897 mutex_exit(hmp); 1898 return (1); 1899 } 1900 } 1901 mutex_exit(hmp); 1902 return (0); 1903 } 1904 1905 static void 1906 bio_pageio_done(struct buf *bp) 1907 { 1908 if (bp->b_flags & B_PAGEIO) { 1909 1910 if (bp->b_flags & B_REMAPPED) 1911 bp_mapout(bp); 1912 1913 if (bp->b_flags & B_READ) 1914 pvn_read_done(bp->b_pages, bp->b_flags); 1915 else 1916 pvn_write_done(bp->b_pages, B_WRITE | bp->b_flags); 1917 pageio_done(bp); 1918 } else { 1919 ASSERT(bp->b_flags & B_REMAPPED); 1920 bp_mapout(bp); 1921 brelse(bp); 1922 } 1923 } 1924 1925 /* 1926 * bioerror(9F) - indicate error in buffer header 1927 * If 'error' is zero, remove the error indication. 
 */
void
bioerror(struct buf *bp, int error)
{
	ASSERT(bp != NULL);
	ASSERT(error >= 0);
	ASSERT(SEMA_HELD(&bp->b_sem));

	if (error != 0) {
		bp->b_flags |= B_ERROR;
	} else {
		bp->b_flags &= ~B_ERROR;
	}
	/* b_error is set (or cleared to 0) unconditionally */
	bp->b_error = error;
}

/*
 * bioreset(9F) - reuse a private buffer header after I/O is complete
 */
void
bioreset(struct buf *bp)
{
	ASSERT(bp != NULL);

	/* destroy then re-create the semaphores and zero the header */
	biofini(bp);
	bioinit(bp);
}

/*
 * biosize(9F) - return size of a buffer header
 */
size_t
biosize(void)
{
	return (sizeof (struct buf));
}

/*
 * biomodified(9F) - check if buffer is modified
 * Returns 1 if any underlying page is modified, 0 if none are,
 * and -1 if the buffer is not a pageio buffer.
 */
int
biomodified(struct buf *bp)
{
	int npf;
	int ppattr;
	struct page *pp;

	ASSERT(bp != NULL);

	if ((bp->b_flags & B_PAGEIO) == 0) {
		return (-1);
	}
	pp = bp->b_pages;
	/* number of pages spanned, accounting for a page-offset start */
	npf = btopr(bp->b_bcount + ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET));

	while (npf > 0) {
		ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
		    HAT_SYNC_STOPON_MOD);
		if (ppattr & P_MOD)
			return (1);
		pp = pp->p_next;
		npf--;
	}

	return (0);
}

/*
 * bioinit(9F) - initialize a buffer structure
 * b_sem and b_io both start at 0 (held/"locked" state).
 */
void
bioinit(struct buf *bp)
{
	bzero(bp, sizeof (struct buf));
	sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);
	sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
	bp->b_offset = -1;
}

/*
 * biofini(9F) - uninitialize a buffer structure
 */
void
biofini(struct buf *bp)
{
	sema_destroy(&bp->b_io);
	sema_destroy(&bp->b_sem);
}

/*
 * bioclone(9F) - clone a buffer
 * Returns bp_mem (reset) if supplied, otherwise a newly allocated
 * header (NULL if allocation fails with a non-sleeping 'sleep' flag).
 */
struct buf *
bioclone(struct buf *bp, off_t off, size_t len, dev_t dev, daddr_t blkno,
	int (*iodone)(struct buf *), struct buf *bp_mem, int sleep)
{
	struct buf *bufp;

	ASSERT(bp);
	if (bp_mem == NULL) {
		bufp = kmem_alloc(sizeof (struct buf), sleep);
		if (bufp == NULL) {
			return (NULL);
		}
		bioinit(bufp);
	} else {
		bufp = bp_mem;
		bioreset(bufp);
	}

#define	BUF_CLONE_FLAGS	(B_READ|B_WRITE|B_SHADOW|B_PHYS|B_PAGEIO|B_FAILFAST|\
	B_ABRWRITE)

	/*
	 * The cloned buffer does not inherit the B_REMAPPED flag.
	 */
	bufp->b_flags = (bp->b_flags & BUF_CLONE_FLAGS) | B_BUSY;
	bufp->b_bcount = len;
	bufp->b_blkno = blkno;
	bufp->b_iodone = iodone;
	bufp->b_proc = bp->b_proc;
	bufp->b_edev = dev;
	bufp->b_file = bp->b_file;
	bufp->b_offset = bp->b_offset;

	if (bp->b_flags & B_SHADOW) {
		ASSERT(bp->b_shadow);
		ASSERT(bp->b_flags & B_PHYS);

		/* advance the shadow page list by the whole pages in off */
		bufp->b_shadow = bp->b_shadow +
		    btop(((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off);
		bufp->b_un.b_addr = (caddr_t)((uintptr_t)bp->b_un.b_addr + off);
		if (bp->b_flags & B_REMAPPED)
			bufp->b_proc = NULL;
	} else {
		if (bp->b_flags & B_PAGEIO) {
			struct page *pp;
			off_t o;
			int i;

			/* walk the page list forward by btop(o) pages */
			pp = bp->b_pages;
			o = ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off;
			for (i = btop(o); i > 0; i--) {
				pp = pp->p_next;
			}
			bufp->b_pages = pp;
			bufp->b_un.b_addr = (caddr_t)(o & PAGEOFFSET);
		} else {
			bufp->b_un.b_addr =
			    (caddr_t)((uintptr_t)bp->b_un.b_addr + off);
			if (bp->b_flags & B_REMAPPED)
				bufp->b_proc = NULL;
		}
	}
	return (bufp);
}