1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 28 /* All Rights Reserved */ 29 30 /* 31 * University Copyright- Copyright (c) 1982, 1986, 1988 32 * The Regents of the University of California 33 * All Rights Reserved 34 * 35 * University Acknowledgment- Portions of this document are derived from 36 * software developed by the University of California, Berkeley, and its 37 * contributors. 38 */ 39 40 #pragma ident "%Z%%M% %I% %E% SMI" 41 42 #include <sys/types.h> 43 #include <sys/t_lock.h> 44 #include <sys/sysmacros.h> 45 #include <sys/conf.h> 46 #include <sys/cpuvar.h> 47 #include <sys/errno.h> 48 #include <sys/debug.h> 49 #include <sys/buf.h> 50 #include <sys/var.h> 51 #include <sys/vnode.h> 52 #include <sys/bitmap.h> 53 #include <sys/cmn_err.h> 54 #include <sys/kmem.h> 55 #include <sys/vmem.h> 56 #include <sys/atomic.h> 57 #include <vm/seg_kmem.h> 58 #include <vm/page.h> 59 #include <vm/pvn.h> 60 #include <sys/vtrace.h> 61 #include <sys/tnf_probe.h> 62 #include <sys/fs/ufs_inode.h> 63 #include <sys/fs/ufs_bio.h> 64 #include <sys/fs/ufs_log.h> 65 #include <sys/systm.h> 66 #include <sys/vfs.h> 67 #include <sys/sdt.h> 68 69 /* Locks */ 70 static kmutex_t blist_lock; /* protects b_list */ 71 static kmutex_t bhdr_lock; /* protects the bhdrlist */ 72 static kmutex_t bfree_lock; /* protects the bfreelist structure */ 73 74 struct hbuf *hbuf; /* Hash buckets */ 75 struct dwbuf *dwbuf; /* Delayed write buckets */ 76 static struct buf *bhdrlist; /* buf header free list */ 77 static int nbuf; /* number of buffer headers allocated */ 78 79 static int lastindex; /* Reference point on where to start */ 80 /* when looking for free buffers */ 81 82 #define bio_bhash(dev, bn) (hash2ints((dev), (int)(bn)) & v.v_hmask) 83 #define EMPTY_LIST ((struct buf *)-1) 84 85 static kcondvar_t bio_mem_cv; /* Condition variables */ 86 static kcondvar_t bio_flushinval_cv; 87 static int bio_doingflush; /* flush in progress */ 88 static int bio_doinginval; /* inval in progress */ 89 static int bio_flinv_cv_wanted; /* someone waiting for cv */ 90 91 /* 92 * Statistics on the buffer cache 93 */ 94 struct biostats biostats = { 95 { "buffer_cache_lookups", KSTAT_DATA_UINT32 }, 96 { "buffer_cache_hits", KSTAT_DATA_UINT32 }, 97 { "new_buffer_requests", KSTAT_DATA_UINT32 }, 98 { "waits_for_buffer_allocs", KSTAT_DATA_UINT32 }, 99 { "buffers_locked_by_someone", KSTAT_DATA_UINT32 }, 100 { "duplicate_buffers_found", KSTAT_DATA_UINT32 } 101 }; 102 103 /* 104 * kstat data 105 */ 106 kstat_named_t *biostats_ptr = (kstat_named_t *)&biostats; 107 uint_t biostats_ndata = (uint_t)(sizeof (biostats) / 108 sizeof (kstat_named_t)); 109 110 /* 111 * Statistics on ufs buffer cache 112 * Not protected by locks 113 */ 114 struct ufsbiostats ub = { 115 { "breads", KSTAT_DATA_UINT32 }, 116 { "bwrites", KSTAT_DATA_UINT32 }, 117 { "fbiwrites", KSTAT_DATA_UINT32 }, 118 { "getpages", KSTAT_DATA_UINT32 }, 119 { "getras", KSTAT_DATA_UINT32 }, 120 { "putsyncs", KSTAT_DATA_UINT32 }, 121 { "putasyncs", KSTAT_DATA_UINT32 }, 122 { "putpageios", KSTAT_DATA_UINT32 }, 123 }; 124 125 /* 126 * more UFS Logging eccentricities... 127 * 128 * required since "#pragma weak ..." doesn't work in reverse order. 129 * i.e.: genunix (bio.c) is loaded before the ufs modules and pointers 130 * to ufs routines don't get plugged into bio.c calls so 131 * we initialize it when setting up the "lufsops" table 132 * in "lufs.c:_init()" 133 */ 134 void (*bio_lufs_strategy)(void *, buf_t *); 135 void (*bio_snapshot_strategy)(void *, buf_t *); 136 137 138 /* Private routines */ 139 static struct buf *bio_getfreeblk(long); 140 static void bio_mem_get(long); 141 static void bio_bhdr_free(struct buf *); 142 static struct buf *bio_bhdr_alloc(void); 143 static void bio_recycle(int, long); 144 static void bio_pageio_done(struct buf *); 145 static int bio_incore(dev_t, daddr_t); 146 147 /* 148 * Buffer cache constants 149 */ 150 #define BIO_BUF_PERCENT (100/2) /* default: 2% of memory */ 151 #define BIO_MAX_PERCENT (100/20) /* max is 20% of real memory */ 152 #define BIO_BHDR_POOL 100 /* Default bhdr pool size */ 153 #define BIO_MIN_HDR 10 /* Minimum number of buffer headers */ 154 #define BIO_MIN_HWM (BIO_MIN_HDR * MAXBSIZE / 1024) 155 #define BIO_HASHLEN 4 /* Target length of hash chains */ 156 157 158 /* Flags for bio_recycle() */ 159 #define BIO_HEADER 0x01 160 #define BIO_MEM 0x02 161 162 extern int bufhwm; /* User tunable - high water mark for mem */ 163 extern int bufhwm_pct; /* ditto - given in % of physmem */ 164 165 /* 166 * The following routines allocate and free 167 * buffers with various side effects. In general the 168 * arguments to an allocate routine are a device and 169 * a block number, and the value is a pointer to 170 * to the buffer header; the buffer returned is locked with a 171 * binary semaphore so that no one else can touch it. If the block was 172 * already in core, no I/O need be done; if it is 173 * already locked, the process waits until it becomes free. 174 * The following routines allocate a buffer: 175 * getblk 176 * bread/BREAD 177 * breada 178 * Eventually the buffer must be released, possibly with the 179 * side effect of writing it out, by using one of 180 * bwrite/BWRITE/brwrite 181 * bdwrite/bdrwrite 182 * bawrite 183 * brelse 184 * 185 * The B_WANTED/B_BUSY bits are NOT used by these routines for synchronization. 186 * Instead, a binary semaphore, b_sem is used to gain exclusive access to 187 * a buffer and a binary semaphore, b_io is used for I/O synchronization. 188 * B_DONE is still used to denote a buffer with I/O complete on it. 189 * 190 * The bfreelist.b_bcount field is computed everytime fsflush runs. It is 191 * should not be used where a very accurate count of the free buffers is 192 * needed. 193 */ 194 195 /* 196 * Read in (if necessary) the block and return a buffer pointer. 197 * 198 * This interface is provided for binary compatibility. Using 199 * BREAD() directly avoids the extra function call overhead invoked 200 * by calling this routine. 201 */ 202 struct buf * 203 bread(dev_t dev, daddr_t blkno, long bsize) 204 { 205 return (BREAD(dev, blkno, bsize)); 206 } 207 208 /* 209 * Common code for reading a buffer with various options 210 * 211 * Read in (if necessary) the block and return a buffer pointer. 212 */ 213 struct buf * 214 bread_common(void *arg, dev_t dev, daddr_t blkno, long bsize) 215 { 216 struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg; 217 struct buf *bp; 218 klwp_t *lwp = ttolwp(curthread); 219 220 CPU_STATS_ADD_K(sys, lread, 1); 221 bp = getblk_common(ufsvfsp, dev, blkno, bsize, /* errflg */ 1); 222 if (bp->b_flags & B_DONE) 223 return (bp); 224 bp->b_flags |= B_READ; 225 ASSERT(bp->b_bcount == bsize); 226 if (ufsvfsp == NULL) { /* !ufs */ 227 (void) bdev_strategy(bp); 228 } else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) { 229 /* ufs && logging */ 230 (*bio_lufs_strategy)(ufsvfsp->vfs_log, bp); 231 } else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) { 232 /* ufs && snapshots */ 233 (*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp); 234 } else { 235 ufsvfsp->vfs_iotstamp = lbolt; 236 ub.ub_breads.value.ul++; /* ufs && !logging */ 237 (void) bdev_strategy(bp); 238 } 239 if (lwp != NULL) 240 lwp->lwp_ru.inblock++; 241 CPU_STATS_ADD_K(sys, bread, 1); 242 (void) biowait(bp); 243 return (bp); 244 } 245 246 /* 247 * Read in the block, like bread, but also start I/O on the 248 * read-ahead block (which is not allocated to the caller). 249 */ 250 struct buf * 251 breada(dev_t dev, daddr_t blkno, daddr_t rablkno, long bsize) 252 { 253 struct buf *bp, *rabp; 254 klwp_t *lwp = ttolwp(curthread); 255 256 bp = NULL; 257 if (!bio_incore(dev, blkno)) { 258 CPU_STATS_ADD_K(sys, lread, 1); 259 bp = GETBLK(dev, blkno, bsize); 260 if ((bp->b_flags & B_DONE) == 0) { 261 bp->b_flags |= B_READ; 262 bp->b_bcount = bsize; 263 (void) bdev_strategy(bp); 264 if (lwp != NULL) 265 lwp->lwp_ru.inblock++; 266 CPU_STATS_ADD_K(sys, bread, 1); 267 } 268 } 269 if (rablkno && bfreelist.b_bcount > 1 && 270 !bio_incore(dev, rablkno)) { 271 rabp = GETBLK(dev, rablkno, bsize); 272 if (rabp->b_flags & B_DONE) 273 brelse(rabp); 274 else { 275 rabp->b_flags |= B_READ|B_ASYNC; 276 rabp->b_bcount = bsize; 277 (void) bdev_strategy(rabp); 278 if (lwp != NULL) 279 lwp->lwp_ru.inblock++; 280 CPU_STATS_ADD_K(sys, bread, 1); 281 } 282 } 283 if (bp == NULL) 284 return (BREAD(dev, blkno, bsize)); 285 (void) biowait(bp); 286 return (bp); 287 } 288 289 /* 290 * Common code for writing a buffer with various options. 291 * 292 * force_wait - wait for write completion regardless of B_ASYNC flag 293 * do_relse - release the buffer when we are done 294 * clear_flags - flags to clear from the buffer 295 */ 296 void 297 bwrite_common(void *arg, struct buf *bp, int force_wait, 298 int do_relse, int clear_flags) 299 { 300 register int do_wait; 301 struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg; 302 int flag; 303 klwp_t *lwp = ttolwp(curthread); 304 struct cpu *cpup; 305 306 ASSERT(SEMA_HELD(&bp->b_sem)); 307 flag = bp->b_flags; 308 bp->b_flags &= ~clear_flags; 309 if (lwp != NULL) 310 lwp->lwp_ru.oublock++; 311 CPU_STATS_ENTER_K(); 312 cpup = CPU; /* get pointer AFTER preemption is disabled */ 313 CPU_STATS_ADDQ(cpup, sys, lwrite, 1); 314 CPU_STATS_ADDQ(cpup, sys, bwrite, 1); 315 do_wait = ((flag & B_ASYNC) == 0 || force_wait); 316 if (do_wait == 0) 317 CPU_STATS_ADDQ(cpup, sys, bawrite, 1); 318 CPU_STATS_EXIT_K(); 319 if (ufsvfsp == NULL) { 320 (void) bdev_strategy(bp); 321 } else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) { 322 /* ufs && logging */ 323 (*bio_lufs_strategy)(ufsvfsp->vfs_log, bp); 324 } else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) { 325 /* ufs && snapshots */ 326 (*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp); 327 } else { 328 ub.ub_bwrites.value.ul++; /* ufs && !logging */ 329 (void) bdev_strategy(bp); 330 } 331 if (do_wait) { 332 (void) biowait(bp); 333 if (do_relse) { 334 brelse(bp); 335 } 336 } 337 } 338 339 /* 340 * Write the buffer, waiting for completion (unless B_ASYNC is set). 341 * Then release the buffer. 342 * This interface is provided for binary compatibility. Using 343 * BWRITE() directly avoids the extra function call overhead invoked 344 * by calling this routine. 345 */ 346 void 347 bwrite(struct buf *bp) 348 { 349 BWRITE(bp); 350 } 351 352 /* 353 * Write the buffer, waiting for completion. 354 * But don't release the buffer afterwards. 355 * This interface is provided for binary compatibility. Using 356 * BWRITE2() directly avoids the extra function call overhead. 357 */ 358 void 359 bwrite2(struct buf *bp) 360 { 361 BWRITE2(bp); 362 } 363 364 /* 365 * Release the buffer, marking it so that if it is grabbed 366 * for another purpose it will be written out before being 367 * given up (e.g. when writing a partial block where it is 368 * assumed that another write for the same block will soon follow). 369 * Also save the time that the block is first marked as delayed 370 * so that it will be written in a reasonable time. 371 */ 372 void 373 bdwrite(struct buf *bp) 374 { 375 ASSERT(SEMA_HELD(&bp->b_sem)); 376 CPU_STATS_ADD_K(sys, lwrite, 1); 377 if ((bp->b_flags & B_DELWRI) == 0) 378 bp->b_start = lbolt; 379 /* 380 * B_DONE allows others to use the buffer, B_DELWRI causes the 381 * buffer to be written before being reused, and setting b_resid 382 * to zero says the buffer is complete. 383 */ 384 bp->b_flags |= B_DELWRI | B_DONE; 385 bp->b_resid = 0; 386 brelse(bp); 387 } 388 389 /* 390 * Release the buffer, start I/O on it, but don't wait for completion. 391 */ 392 void 393 bawrite(struct buf *bp) 394 { 395 ASSERT(SEMA_HELD(&bp->b_sem)); 396 397 /* Use bfreelist.b_bcount as a weird-ass heuristic */ 398 if (bfreelist.b_bcount > 4) 399 bp->b_flags |= B_ASYNC; 400 BWRITE(bp); 401 } 402 403 /* 404 * Release the buffer, with no I/O implied. 405 */ 406 void 407 brelse(struct buf *bp) 408 { 409 struct buf **backp; 410 uint_t index; 411 kmutex_t *hmp; 412 struct buf *dp; 413 struct hbuf *hp; 414 415 416 ASSERT(SEMA_HELD(&bp->b_sem)); 417 418 /* 419 * Clear the retry write flag if the buffer was written without 420 * error. The presence of B_DELWRI means the buffer has not yet 421 * been written and the presence of B_ERROR means that an error 422 * is still occurring. 423 */ 424 if ((bp->b_flags & (B_ERROR | B_DELWRI | B_RETRYWRI)) == B_RETRYWRI) { 425 bp->b_flags &= ~B_RETRYWRI; 426 } 427 428 /* Check for anomalous conditions */ 429 if (bp->b_flags & (B_ERROR|B_NOCACHE)) { 430 if (bp->b_flags & B_NOCACHE) { 431 /* Don't add to the freelist. Destroy it now */ 432 kmem_free(bp->b_un.b_addr, bp->b_bufsize); 433 sema_destroy(&bp->b_sem); 434 sema_destroy(&bp->b_io); 435 kmem_free(bp, sizeof (struct buf)); 436 return; 437 } 438 /* 439 * If a write failed and we are supposed to retry write, 440 * don't toss the buffer. Keep it around and mark it 441 * delayed write in the hopes that it will eventually 442 * get flushed (and still keep the system running.) 443 */ 444 if ((bp->b_flags & (B_READ | B_RETRYWRI)) == B_RETRYWRI) { 445 bp->b_flags |= B_DELWRI; 446 /* keep fsflush from trying continuously to flush */ 447 bp->b_start = lbolt; 448 } else 449 bp->b_flags |= B_AGE|B_STALE; 450 bp->b_flags &= ~B_ERROR; 451 bp->b_error = 0; 452 } 453 454 /* 455 * If delayed write is set then put in on the delayed 456 * write list instead of the free buffer list. 457 */ 458 index = bio_bhash(bp->b_edev, bp->b_blkno); 459 hmp = &hbuf[index].b_lock; 460 461 mutex_enter(hmp); 462 hp = &hbuf[index]; 463 dp = (struct buf *)hp; 464 465 /* 466 * Make sure that the number of entries on this list are 467 * Zero <= count <= total # buffers 468 */ 469 ASSERT(hp->b_length >= 0); 470 ASSERT(hp->b_length < nbuf); 471 472 hp->b_length++; /* We are adding this buffer */ 473 474 if (bp->b_flags & B_DELWRI) { 475 /* 476 * This buffer goes on the delayed write buffer list 477 */ 478 dp = (struct buf *)&dwbuf[index]; 479 } 480 ASSERT(bp->b_bufsize > 0); 481 ASSERT(bp->b_bcount > 0); 482 ASSERT(bp->b_un.b_addr != NULL); 483 484 if (bp->b_flags & B_AGE) { 485 backp = &dp->av_forw; 486 (*backp)->av_back = bp; 487 bp->av_forw = *backp; 488 *backp = bp; 489 bp->av_back = dp; 490 } else { 491 backp = &dp->av_back; 492 (*backp)->av_forw = bp; 493 bp->av_back = *backp; 494 *backp = bp; 495 bp->av_forw = dp; 496 } 497 mutex_exit(hmp); 498 499 if (bfreelist.b_flags & B_WANTED) { 500 /* 501 * Should come here very very rarely. 502 */ 503 mutex_enter(&bfree_lock); 504 if (bfreelist.b_flags & B_WANTED) { 505 bfreelist.b_flags &= ~B_WANTED; 506 cv_broadcast(&bio_mem_cv); 507 } 508 mutex_exit(&bfree_lock); 509 } 510 511 bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC); 512 /* 513 * Don't let anyone get the buffer off the freelist before we 514 * release our hold on it. 515 */ 516 sema_v(&bp->b_sem); 517 } 518 519 /* 520 * Return a count of the number of B_BUSY buffers in the system 521 * Can only be used as a good estimate. If 'cleanit' is set, 522 * try to flush all bufs. 523 */ 524 int 525 bio_busy(int cleanit) 526 { 527 struct buf *bp, *dp; 528 int busy = 0; 529 int i; 530 kmutex_t *hmp; 531 532 for (i = 0; i < v.v_hbuf; i++) { 533 vfs_syncprogress(); 534 dp = (struct buf *)&hbuf[i]; 535 hmp = &hbuf[i].b_lock; 536 537 mutex_enter(hmp); 538 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) { 539 if (bp->b_flags & B_BUSY) 540 busy++; 541 } 542 mutex_exit(hmp); 543 } 544 545 if (cleanit && busy != 0) { 546 bflush(NODEV); 547 } 548 549 return (busy); 550 } 551 552 /* 553 * this interface is provided for binary compatibility. 554 * 555 * Assign a buffer for the given block. If the appropriate 556 * block is already associated, return it; otherwise search 557 * for the oldest non-busy buffer and reassign it. 558 */ 559 struct buf * 560 getblk(dev_t dev, daddr_t blkno, long bsize) 561 { 562 return (getblk_common(/* ufsvfsp */ NULL, dev, 563 blkno, bsize, /* errflg */ 0)); 564 } 565 566 /* 567 * Assign a buffer for the given block. If the appropriate 568 * block is already associated, return it; otherwise search 569 * for the oldest non-busy buffer and reassign it. 570 */ 571 struct buf * 572 getblk_common(void * arg, dev_t dev, daddr_t blkno, long bsize, int errflg) 573 { 574 ufsvfs_t *ufsvfsp = (struct ufsvfs *)arg; 575 struct buf *bp; 576 struct buf *dp; 577 struct buf *nbp = NULL; 578 struct buf *errbp; 579 uint_t index; 580 kmutex_t *hmp; 581 struct hbuf *hp; 582 583 if (getmajor(dev) >= devcnt) 584 cmn_err(CE_PANIC, "blkdev"); 585 586 biostats.bio_lookup.value.ui32++; 587 588 index = bio_bhash(dev, blkno); 589 hp = &hbuf[index]; 590 dp = (struct buf *)hp; 591 hmp = &hp->b_lock; 592 593 mutex_enter(hmp); 594 loop: 595 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) { 596 if (bp->b_blkno != blkno || bp->b_edev != dev || 597 (bp->b_flags & B_STALE)) 598 continue; 599 /* 600 * Avoid holding the hash lock in the event that 601 * the buffer is locked by someone. Since the hash chain 602 * may change when we drop the hash lock 603 * we have to start at the beginning of the chain if the 604 * buffer identity/contents aren't valid. 605 */ 606 if (!sema_tryp(&bp->b_sem)) { 607 biostats.bio_bufbusy.value.ui32++; 608 mutex_exit(hmp); 609 /* 610 * OK, we are dealing with a busy buffer. 611 * In the case that we are panicking and we 612 * got called from bread(), we have some chance 613 * for error recovery. So better bail out from 614 * here since sema_p() won't block. If we got 615 * called directly from ufs routines, there is 616 * no way to report an error yet. 617 */ 618 if (panicstr && errflg) 619 goto errout; 620 /* 621 * For the following line of code to work 622 * correctly never kmem_free the buffer "header". 623 */ 624 sema_p(&bp->b_sem); 625 if (bp->b_blkno != blkno || bp->b_edev != dev || 626 (bp->b_flags & B_STALE)) { 627 sema_v(&bp->b_sem); 628 mutex_enter(hmp); 629 goto loop; /* start over */ 630 } 631 mutex_enter(hmp); 632 } 633 /* Found */ 634 biostats.bio_hit.value.ui32++; 635 bp->b_flags &= ~B_AGE; 636 637 /* 638 * Yank it off the free/delayed write lists 639 */ 640 hp->b_length--; 641 notavail(bp); 642 mutex_exit(hmp); 643 644 ASSERT((bp->b_flags & B_NOCACHE) == NULL); 645 646 if (nbp == NULL) { 647 /* 648 * Make the common path short. 649 */ 650 ASSERT(SEMA_HELD(&bp->b_sem)); 651 return (bp); 652 } 653 654 biostats.bio_bufdup.value.ui32++; 655 656 /* 657 * The buffer must have entered during the lock upgrade 658 * so free the new buffer we allocated and return the 659 * found buffer. 660 */ 661 kmem_free(nbp->b_un.b_addr, nbp->b_bufsize); 662 nbp->b_un.b_addr = NULL; 663 664 /* 665 * Account for the memory 666 */ 667 mutex_enter(&bfree_lock); 668 bfreelist.b_bufsize += nbp->b_bufsize; 669 mutex_exit(&bfree_lock); 670 671 /* 672 * Destroy buf identity, and place on avail list 673 */ 674 nbp->b_dev = (o_dev_t)NODEV; 675 nbp->b_edev = NODEV; 676 nbp->b_flags = 0; 677 nbp->b_file = NULL; 678 nbp->b_offset = -1; 679 680 sema_v(&nbp->b_sem); 681 bio_bhdr_free(nbp); 682 683 ASSERT(SEMA_HELD(&bp->b_sem)); 684 return (bp); 685 } 686 687 /* 688 * bio_getfreeblk may block so check the hash chain again. 689 */ 690 if (nbp == NULL) { 691 mutex_exit(hmp); 692 nbp = bio_getfreeblk(bsize); 693 mutex_enter(hmp); 694 goto loop; 695 } 696 697 /* 698 * New buffer. Assign nbp and stick it on the hash. 699 */ 700 nbp->b_flags = B_BUSY; 701 nbp->b_edev = dev; 702 nbp->b_dev = (o_dev_t)cmpdev(dev); 703 nbp->b_blkno = blkno; 704 nbp->b_iodone = NULL; 705 nbp->b_bcount = bsize; 706 /* 707 * If we are given a ufsvfsp and the vfs_root field is NULL 708 * then this must be I/O for a superblock. A superblock's 709 * buffer is set up in mountfs() and there is no root vnode 710 * at that point. 711 */ 712 if (ufsvfsp && ufsvfsp->vfs_root) { 713 nbp->b_vp = ufsvfsp->vfs_root; 714 } else { 715 nbp->b_vp = NULL; 716 } 717 718 ASSERT((nbp->b_flags & B_NOCACHE) == NULL); 719 720 binshash(nbp, dp); 721 mutex_exit(hmp); 722 723 ASSERT(SEMA_HELD(&nbp->b_sem)); 724 725 return (nbp); 726 727 728 /* 729 * Come here in case of an internal error. At this point we couldn't 730 * get a buffer, but he have to return one. Hence we allocate some 731 * kind of error reply buffer on the fly. This buffer is marked as 732 * B_NOCACHE | B_AGE | B_ERROR | B_DONE to assure the following: 733 * - B_ERROR will indicate error to the caller. 734 * - B_DONE will prevent us from reading the buffer from 735 * the device. 736 * - B_NOCACHE will cause that this buffer gets free'd in 737 * brelse(). 738 */ 739 740 errout: 741 errbp = geteblk(); 742 sema_p(&errbp->b_sem); 743 errbp->b_flags &= ~B_BUSY; 744 errbp->b_flags |= (B_ERROR | B_DONE); 745 return (errbp); 746 } 747 748 /* 749 * Get an empty block, not assigned to any particular device. 750 * Returns a locked buffer that is not on any hash or free list. 751 */ 752 struct buf * 753 ngeteblk(long bsize) 754 { 755 struct buf *bp; 756 757 bp = kmem_alloc(sizeof (struct buf), KM_SLEEP); 758 bioinit(bp); 759 bp->av_forw = bp->av_back = NULL; 760 bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP); 761 bp->b_bufsize = bsize; 762 bp->b_flags = B_BUSY | B_NOCACHE | B_AGE; 763 bp->b_dev = (o_dev_t)NODEV; 764 bp->b_edev = NODEV; 765 bp->b_lblkno = 0; 766 bp->b_bcount = bsize; 767 bp->b_iodone = NULL; 768 return (bp); 769 } 770 771 /* 772 * Interface of geteblk() is kept intact to maintain driver compatibility. 773 * Use ngeteblk() to allocate block size other than 1 KB. 774 */ 775 struct buf * 776 geteblk(void) 777 { 778 return (ngeteblk((long)1024)); 779 } 780 781 /* 782 * Return a buffer w/o sleeping 783 */ 784 struct buf * 785 trygetblk(dev_t dev, daddr_t blkno) 786 { 787 struct buf *bp; 788 struct buf *dp; 789 struct hbuf *hp; 790 kmutex_t *hmp; 791 uint_t index; 792 793 index = bio_bhash(dev, blkno); 794 hp = &hbuf[index]; 795 hmp = &hp->b_lock; 796 797 if (!mutex_tryenter(hmp)) 798 return (NULL); 799 800 dp = (struct buf *)hp; 801 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) { 802 if (bp->b_blkno != blkno || bp->b_edev != dev || 803 (bp->b_flags & B_STALE)) 804 continue; 805 /* 806 * Get access to a valid buffer without sleeping 807 */ 808 if (sema_tryp(&bp->b_sem)) { 809 if (bp->b_flags & B_DONE) { 810 hp->b_length--; 811 notavail(bp); 812 mutex_exit(hmp); 813 return (bp); 814 } else { 815 sema_v(&bp->b_sem); 816 break; 817 } 818 } 819 break; 820 } 821 mutex_exit(hmp); 822 return (NULL); 823 } 824 825 /* 826 * Wait for I/O completion on the buffer; return errors 827 * to the user. 828 */ 829 int 830 iowait(struct buf *bp) 831 { 832 ASSERT(SEMA_HELD(&bp->b_sem)); 833 return (biowait(bp)); 834 } 835 836 /* 837 * Mark I/O complete on a buffer, release it if I/O is asynchronous, 838 * and wake up anyone waiting for it. 839 */ 840 void 841 iodone(struct buf *bp) 842 { 843 ASSERT(SEMA_HELD(&bp->b_sem)); 844 (void) biodone(bp); 845 } 846 847 /* 848 * Zero the core associated with a buffer. 849 */ 850 void 851 clrbuf(struct buf *bp) 852 { 853 ASSERT(SEMA_HELD(&bp->b_sem)); 854 bzero(bp->b_un.b_addr, bp->b_bcount); 855 bp->b_resid = 0; 856 } 857 858 859 /* 860 * Make sure all write-behind blocks on dev (or NODEV for all) 861 * are flushed out. 862 */ 863 void 864 bflush(dev_t dev) 865 { 866 struct buf *bp, *dp; 867 struct hbuf *hp; 868 struct buf *delwri_list = EMPTY_LIST; 869 int i, index; 870 kmutex_t *hmp; 871 872 mutex_enter(&blist_lock); 873 /* 874 * Wait for any invalidates or flushes ahead of us to finish. 875 * We really could split blist_lock up per device for better 876 * parallelism here. 877 */ 878 while (bio_doinginval || bio_doingflush) { 879 bio_flinv_cv_wanted = 1; 880 cv_wait(&bio_flushinval_cv, &blist_lock); 881 } 882 bio_doingflush++; 883 /* 884 * Gather all B_DELWRI buffer for device. 885 * Lock ordering is b_sem > hash lock (brelse). 886 * Since we are finding the buffer via the delayed write list, 887 * it may be busy and we would block trying to get the 888 * b_sem lock while holding hash lock. So transfer all the 889 * candidates on the delwri_list and then drop the hash locks. 890 */ 891 for (i = 0; i < v.v_hbuf; i++) { 892 vfs_syncprogress(); 893 hmp = &hbuf[i].b_lock; 894 dp = (struct buf *)&dwbuf[i]; 895 mutex_enter(hmp); 896 for (bp = dp->av_forw; bp != dp; bp = bp->av_forw) { 897 if (dev == NODEV || bp->b_edev == dev) { 898 if (bp->b_list == NULL) { 899 bp->b_list = delwri_list; 900 delwri_list = bp; 901 } 902 } 903 } 904 mutex_exit(hmp); 905 } 906 mutex_exit(&blist_lock); 907 908 /* 909 * Now that the hash locks have been dropped grab the semaphores 910 * and write back all the buffers that have B_DELWRI set. 911 */ 912 while (delwri_list != EMPTY_LIST) { 913 vfs_syncprogress(); 914 bp = delwri_list; 915 916 sema_p(&bp->b_sem); /* may block */ 917 if ((dev != bp->b_edev && dev != NODEV) || 918 (panicstr && bp->b_flags & B_BUSY)) { 919 sema_v(&bp->b_sem); 920 delwri_list = bp->b_list; 921 bp->b_list = NULL; 922 continue; /* No longer a candidate */ 923 } 924 if (bp->b_flags & B_DELWRI) { 925 index = bio_bhash(bp->b_edev, bp->b_blkno); 926 hp = &hbuf[index]; 927 hmp = &hp->b_lock; 928 dp = (struct buf *)hp; 929 930 bp->b_flags |= B_ASYNC; 931 mutex_enter(hmp); 932 hp->b_length--; 933 notavail(bp); 934 mutex_exit(hmp); 935 if (bp->b_vp == NULL) { /* !ufs */ 936 BWRITE(bp); 937 } else { /* ufs */ 938 UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp); 939 } 940 } else { 941 sema_v(&bp->b_sem); 942 } 943 delwri_list = bp->b_list; 944 bp->b_list = NULL; 945 } 946 mutex_enter(&blist_lock); 947 bio_doingflush--; 948 if (bio_flinv_cv_wanted) { 949 bio_flinv_cv_wanted = 0; 950 cv_broadcast(&bio_flushinval_cv); 951 } 952 mutex_exit(&blist_lock); 953 } 954 955 /* 956 * Ensure that a specified block is up-to-date on disk. 957 */ 958 void 959 blkflush(dev_t dev, daddr_t blkno) 960 { 961 struct buf *bp, *dp; 962 struct hbuf *hp; 963 struct buf *sbp = NULL; 964 uint_t index; 965 kmutex_t *hmp; 966 967 index = bio_bhash(dev, blkno); 968 hp = &hbuf[index]; 969 dp = (struct buf *)hp; 970 hmp = &hp->b_lock; 971 972 /* 973 * Identify the buffer in the cache belonging to 974 * this device and blkno (if any). 975 */ 976 mutex_enter(hmp); 977 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) { 978 if (bp->b_blkno != blkno || bp->b_edev != dev || 979 (bp->b_flags & B_STALE)) 980 continue; 981 sbp = bp; 982 break; 983 } 984 mutex_exit(hmp); 985 if (sbp == NULL) 986 return; 987 /* 988 * Now check the buffer we have identified and 989 * make sure it still belongs to the device and is B_DELWRI 990 */ 991 sema_p(&sbp->b_sem); 992 if (sbp->b_blkno == blkno && sbp->b_edev == dev && 993 (sbp->b_flags & (B_DELWRI|B_STALE)) == B_DELWRI) { 994 mutex_enter(hmp); 995 hp->b_length--; 996 notavail(sbp); 997 mutex_exit(hmp); 998 /* 999 * XXX - There is nothing to guarantee a synchronous 1000 * write here if the B_ASYNC flag is set. This needs 1001 * some investigation. 1002 */ 1003 if (sbp->b_vp == NULL) { /* !ufs */ 1004 BWRITE(sbp); /* synchronous write */ 1005 } else { /* ufs */ 1006 UFS_BWRITE(VTOI(sbp->b_vp)->i_ufsvfs, sbp); 1007 } 1008 } else { 1009 sema_v(&sbp->b_sem); 1010 } 1011 } 1012 1013 /* 1014 * Same as binval, except can force-invalidate delayed-write buffers 1015 * (which are not be already flushed because of device errors). Also 1016 * makes sure that the retry write flag is cleared. 1017 */ 1018 int 1019 bfinval(dev_t dev, int force) 1020 { 1021 struct buf *dp; 1022 struct buf *bp; 1023 struct buf *binval_list = EMPTY_LIST; 1024 int i, error = 0; 1025 kmutex_t *hmp; 1026 uint_t index; 1027 struct buf **backp; 1028 1029 mutex_enter(&blist_lock); 1030 /* 1031 * Wait for any flushes ahead of us to finish, it's ok to 1032 * do invalidates in parallel. 1033 */ 1034 while (bio_doingflush) { 1035 bio_flinv_cv_wanted = 1; 1036 cv_wait(&bio_flushinval_cv, &blist_lock); 1037 } 1038 bio_doinginval++; 1039 1040 /* Gather bp's */ 1041 for (i = 0; i < v.v_hbuf; i++) { 1042 dp = (struct buf *)&hbuf[i]; 1043 hmp = &hbuf[i].b_lock; 1044 1045 mutex_enter(hmp); 1046 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) { 1047 if (bp->b_edev == dev) { 1048 if (bp->b_list == NULL) { 1049 bp->b_list = binval_list; 1050 binval_list = bp; 1051 } 1052 } 1053 } 1054 mutex_exit(hmp); 1055 } 1056 mutex_exit(&blist_lock); 1057 1058 /* Invalidate all bp's found */ 1059 while (binval_list != EMPTY_LIST) { 1060 bp = binval_list; 1061 1062 sema_p(&bp->b_sem); 1063 if (bp->b_edev == dev) { 1064 if (force && (bp->b_flags & B_DELWRI)) { 1065 /* clear B_DELWRI, move to non-dw freelist */ 1066 index = bio_bhash(bp->b_edev, bp->b_blkno); 1067 hmp = &hbuf[index].b_lock; 1068 dp = (struct buf *)&hbuf[index]; 1069 mutex_enter(hmp); 1070 1071 /* remove from delayed write freelist */ 1072 notavail(bp); 1073 1074 /* add to B_AGE side of non-dw freelist */ 1075 backp = &dp->av_forw; 1076 (*backp)->av_back = bp; 1077 bp->av_forw = *backp; 1078 *backp = bp; 1079 bp->av_back = dp; 1080 1081 /* 1082 * make sure write retries and busy are cleared 1083 */ 1084 bp->b_flags &= 1085 ~(B_BUSY | B_DELWRI | B_RETRYWRI); 1086 mutex_exit(hmp); 1087 } 1088 if ((bp->b_flags & B_DELWRI) == 0) 1089 bp->b_flags |= B_STALE|B_AGE; 1090 else 1091 error = EIO; 1092 } 1093 sema_v(&bp->b_sem); 1094 binval_list = bp->b_list; 1095 bp->b_list = NULL; 1096 } 1097 mutex_enter(&blist_lock); 1098 bio_doinginval--; 1099 if (bio_flinv_cv_wanted) { 1100 cv_broadcast(&bio_flushinval_cv); 1101 bio_flinv_cv_wanted = 0; 1102 } 1103 mutex_exit(&blist_lock); 1104 return (error); 1105 } 1106 1107 /* 1108 * If possible, invalidate blocks for a dev on demand 1109 */ 1110 void 1111 binval(dev_t dev) 1112 { 1113 (void) bfinval(dev, 0); 1114 } 1115 1116 /* 1117 * Initialize the buffer I/O system by freeing 1118 * all buffers and setting all device hash buffer lists to empty. 1119 */ 1120 void 1121 binit(void) 1122 { 1123 struct buf *bp; 1124 unsigned int i, pct; 1125 ulong_t bio_max_hwm, bio_default_hwm; 1126 1127 /* 1128 * Maximum/Default values for bufhwm are set to the smallest of: 1129 * - BIO_MAX_PERCENT resp. BIO_BUF_PERCENT of real memory 1130 * - 1/4 of kernel virtual memory 1131 * - INT32_MAX to prevent overflows of v.v_bufhwm (which is int). 1132 * Additionally, in order to allow simple tuning by percentage of 1133 * physical memory, bufhwm_pct is used to calculate the default if 1134 * the value of this tunable is between 0 and BIO_MAX_PERCENT. 1135 * 1136 * Since the unit for v.v_bufhwm is kilobytes, this allows for 1137 * a maximum of 1024 * 2GB == 2TB memory usage by buffer headers. 1138 */ 1139 bio_max_hwm = MIN(physmem / BIO_MAX_PERCENT, 1140 btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024); 1141 bio_max_hwm = MIN(INT32_MAX, bio_max_hwm); 1142 1143 pct = BIO_BUF_PERCENT; 1144 if (bufhwm_pct != 0 && 1145 ((pct = 100 / bufhwm_pct) < BIO_MAX_PERCENT)) { 1146 pct = BIO_BUF_PERCENT; 1147 /* 1148 * Invalid user specified value, emit a warning. 1149 */ 1150 cmn_err(CE_WARN, "binit: bufhwm_pct(%d) out of \ 1151 range(1..%d). Using %d as default.", 1152 bufhwm_pct, 1153 100 / BIO_MAX_PERCENT, 100 / BIO_BUF_PERCENT); 1154 } 1155 1156 bio_default_hwm = MIN(physmem / pct, 1157 btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024); 1158 bio_default_hwm = MIN(INT32_MAX, bio_default_hwm); 1159 1160 if ((v.v_bufhwm = bufhwm) == 0) 1161 v.v_bufhwm = bio_default_hwm; 1162 1163 if (v.v_bufhwm < BIO_MIN_HWM || v.v_bufhwm > bio_max_hwm) { 1164 v.v_bufhwm = (int)bio_max_hwm; 1165 /* 1166 * Invalid user specified value, emit a warning. 1167 */ 1168 cmn_err(CE_WARN, 1169 "binit: bufhwm(%d) out \ 1170 of range(%d..%lu). Using %lu as default", 1171 bufhwm, 1172 BIO_MIN_HWM, bio_max_hwm, bio_max_hwm); 1173 } 1174 1175 /* 1176 * Determine the number of hash buckets. Default is to 1177 * create ~BIO_HASHLEN entries per chain based on MAXBSIZE buffers. 1178 * Round up number to the next power of 2. 1179 */ 1180 v.v_hbuf = 1 << highbit((((ulong_t)v.v_bufhwm * 1024) / MAXBSIZE) / 1181 BIO_HASHLEN); 1182 v.v_hmask = v.v_hbuf - 1; 1183 v.v_buf = BIO_BHDR_POOL; 1184 1185 hbuf = kmem_zalloc(v.v_hbuf * sizeof (struct hbuf), KM_SLEEP); 1186 1187 dwbuf = kmem_zalloc(v.v_hbuf * sizeof (struct dwbuf), KM_SLEEP); 1188 1189 bfreelist.b_bufsize = (size_t)v.v_bufhwm * 1024; 1190 bp = &bfreelist; 1191 bp->b_forw = bp->b_back = bp->av_forw = bp->av_back = bp; 1192 1193 for (i = 0; i < v.v_hbuf; i++) { 1194 hbuf[i].b_forw = hbuf[i].b_back = (struct buf *)&hbuf[i]; 1195 hbuf[i].av_forw = hbuf[i].av_back = (struct buf *)&hbuf[i]; 1196 1197 /* 1198 * Initialize the delayed write buffer list. 1199 */ 1200 dwbuf[i].b_forw = dwbuf[i].b_back = (struct buf *)&dwbuf[i]; 1201 dwbuf[i].av_forw = dwbuf[i].av_back = (struct buf *)&dwbuf[i]; 1202 } 1203 } 1204 1205 /* 1206 * Wait for I/O completion on the buffer; return error code. 1207 * If bp was for synchronous I/O, bp is invalid and associated 1208 * resources are freed on return. 1209 */ 1210 int 1211 biowait(struct buf *bp) 1212 { 1213 int error = 0; 1214 struct cpu *cpup; 1215 1216 ASSERT(SEMA_HELD(&bp->b_sem)); 1217 1218 cpup = CPU; 1219 atomic_add_64(&cpup->cpu_stats.sys.iowait, 1); 1220 DTRACE_IO1(wait__start, struct buf *, bp); 1221 1222 /* 1223 * In case of panic, busy wait for completion 1224 */ 1225 if (panicstr) { 1226 while ((bp->b_flags & B_DONE) == 0) 1227 drv_usecwait(10); 1228 } else 1229 sema_p(&bp->b_io); 1230 1231 DTRACE_IO1(wait__done, struct buf *, bp); 1232 atomic_add_64(&cpup->cpu_stats.sys.iowait, -1); 1233 1234 error = geterror(bp); 1235 if ((bp->b_flags & B_ASYNC) == 0) { 1236 if (bp->b_flags & B_REMAPPED) 1237 bp_mapout(bp); 1238 } 1239 return (error); 1240 } 1241 1242 static void 1243 biodone_tnf_probe(struct buf *bp) 1244 { 1245 /* Kernel probe */ 1246 TNF_PROBE_3(biodone, "io blockio", /* CSTYLED */, 1247 tnf_device, device, bp->b_edev, 1248 tnf_diskaddr, block, bp->b_lblkno, 1249 tnf_opaque, buf, bp); 1250 } 1251 1252 /* 1253 * Mark I/O complete on a buffer, release it if I/O is asynchronous, 1254 * and wake up anyone waiting for it. 1255 */ 1256 void 1257 biodone(struct buf *bp) 1258 { 1259 if (bp->b_flags & B_STARTED) { 1260 DTRACE_IO1(done, struct buf *, bp); 1261 bp->b_flags &= ~B_STARTED; 1262 } 1263 1264 /* 1265 * Call the TNF probe here instead of the inline code 1266 * to force our compiler to use the tail call optimization. 1267 */ 1268 biodone_tnf_probe(bp); 1269 1270 if (bp->b_iodone != NULL) { 1271 (*(bp->b_iodone))(bp); 1272 return; 1273 } 1274 ASSERT((bp->b_flags & B_DONE) == 0); 1275 ASSERT(SEMA_HELD(&bp->b_sem)); 1276 bp->b_flags |= B_DONE; 1277 if (bp->b_flags & B_ASYNC) { 1278 if (bp->b_flags & (B_PAGEIO|B_REMAPPED)) 1279 bio_pageio_done(bp); 1280 else 1281 brelse(bp); /* release bp to freelist */ 1282 } else { 1283 sema_v(&bp->b_io); 1284 } 1285 } 1286 1287 /* 1288 * Pick up the device's error number and pass it to the user; 1289 * if there is an error but the number is 0 set a generalized code. 1290 */ 1291 int 1292 geterror(struct buf *bp) 1293 { 1294 int error = 0; 1295 1296 ASSERT(SEMA_HELD(&bp->b_sem)); 1297 if (bp->b_flags & B_ERROR) { 1298 error = bp->b_error; 1299 if (!error) 1300 error = EIO; 1301 } 1302 return (error); 1303 } 1304 1305 /* 1306 * Support for pageio buffers. 1307 * 1308 * This stuff should be generalized to provide a generalized bp 1309 * header facility that can be used for things other than pageio. 1310 */ 1311 1312 /* 1313 * Allocate and initialize a buf struct for use with pageio. 1314 */ 1315 struct buf * 1316 pageio_setup(struct page *pp, size_t len, struct vnode *vp, int flags) 1317 { 1318 struct buf *bp; 1319 struct cpu *cpup; 1320 1321 if (flags & B_READ) { 1322 CPU_STATS_ENTER_K(); 1323 cpup = CPU; /* get pointer AFTER preemption is disabled */ 1324 CPU_STATS_ADDQ(cpup, vm, pgin, 1); 1325 CPU_STATS_ADDQ(cpup, vm, pgpgin, btopr(len)); 1326 if ((flags & B_ASYNC) == 0) { 1327 klwp_t *lwp = ttolwp(curthread); 1328 if (lwp != NULL) 1329 lwp->lwp_ru.majflt++; 1330 CPU_STATS_ADDQ(cpup, vm, maj_fault, 1); 1331 /* Kernel probe */ 1332 TNF_PROBE_2(major_fault, "vm pagefault", /* CSTYLED */, 1333 tnf_opaque, vnode, pp->p_vnode, 1334 tnf_offset, offset, pp->p_offset); 1335 } 1336 /* 1337 * Update statistics for pages being paged in 1338 */ 1339 if (pp != NULL && pp->p_vnode != NULL) { 1340 if (IS_SWAPFSVP(pp->p_vnode)) { 1341 CPU_STATS_ADDQ(cpup, vm, anonpgin, 1342 btopr(len)); 1343 } else { 1344 if (pp->p_vnode->v_flag & VVMEXEC) { 1345 CPU_STATS_ADDQ(cpup, vm, execpgin, 1346 btopr(len)); 1347 } else { 1348 CPU_STATS_ADDQ(cpup, vm, fspgin, 1349 btopr(len)); 1350 } 1351 } 1352 } 1353 CPU_STATS_EXIT_K(); 1354 TRACE_1(TR_FAC_VM, TR_PAGE_WS_IN, 1355 "page_ws_in:pp %p", pp); 1356 /* Kernel probe */ 1357 TNF_PROBE_3(pagein, "vm pageio io", /* CSTYLED */, 1358 tnf_opaque, vnode, pp->p_vnode, 1359 tnf_offset, offset, pp->p_offset, 1360 tnf_size, size, len); 1361 } 1362 1363 bp = kmem_zalloc(sizeof (struct buf), KM_SLEEP); 1364 bp->b_bcount = len; 1365 bp->b_bufsize = len; 1366 bp->b_pages = pp; 1367 bp->b_flags = B_PAGEIO | B_NOCACHE | B_BUSY | flags; 1368 bp->b_offset = -1; 1369 sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL); 1370 1371 /* Initialize bp->b_sem in "locked" state */ 1372 sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL); 1373 1374 VN_HOLD(vp); 1375 bp->b_vp = vp; 1376 THREAD_KPRI_RELEASE_N(btopr(len)); /* release kpri from page_locks */ 1377 1378 /* 1379 * Caller sets dev & blkno and can adjust 1380 * b_addr for page offset and can use bp_mapin 1381 * to make pages kernel addressable. 1382 */ 1383 return (bp); 1384 } 1385 1386 void 1387 pageio_done(struct buf *bp) 1388 { 1389 ASSERT(SEMA_HELD(&bp->b_sem)); 1390 if (bp->b_flags & B_REMAPPED) 1391 bp_mapout(bp); 1392 VN_RELE(bp->b_vp); 1393 bp->b_vp = NULL; 1394 ASSERT((bp->b_flags & B_NOCACHE) != 0); 1395 1396 /* A sema_v(bp->b_sem) is implied if we are destroying it */ 1397 sema_destroy(&bp->b_sem); 1398 sema_destroy(&bp->b_io); 1399 kmem_free(bp, sizeof (struct buf)); 1400 } 1401 1402 /* 1403 * Check to see whether the buffers, except the one pointed by sbp, 1404 * associated with the device are busy. 1405 * NOTE: This expensive operation shall be improved together with ufs_icheck(). 1406 */ 1407 int 1408 bcheck(dev_t dev, struct buf *sbp) 1409 { 1410 struct buf *bp; 1411 struct buf *dp; 1412 int i; 1413 kmutex_t *hmp; 1414 1415 /* 1416 * check for busy bufs for this filesystem 1417 */ 1418 for (i = 0; i < v.v_hbuf; i++) { 1419 dp = (struct buf *)&hbuf[i]; 1420 hmp = &hbuf[i].b_lock; 1421 1422 mutex_enter(hmp); 1423 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) { 1424 /* 1425 * if buf is busy or dirty, then filesystem is busy 1426 */ 1427 if ((bp->b_edev == dev) && 1428 ((bp->b_flags & B_STALE) == 0) && 1429 (bp->b_flags & (B_DELWRI|B_BUSY)) && 1430 (bp != sbp)) { 1431 mutex_exit(hmp); 1432 return (1); 1433 } 1434 } 1435 mutex_exit(hmp); 1436 } 1437 return (0); 1438 } 1439 1440 /* 1441 * Hash two 32 bit entities. 1442 */ 1443 int 1444 hash2ints(int x, int y) 1445 { 1446 int hash = 0; 1447 1448 hash = x - 1; 1449 hash = ((hash * 7) + (x >> 8)) - 1; 1450 hash = ((hash * 7) + (x >> 16)) - 1; 1451 hash = ((hash * 7) + (x >> 24)) - 1; 1452 hash = ((hash * 7) + y) - 1; 1453 hash = ((hash * 7) + (y >> 8)) - 1; 1454 hash = ((hash * 7) + (y >> 16)) - 1; 1455 hash = ((hash * 7) + (y >> 24)) - 1; 1456 1457 return (hash); 1458 } 1459 1460 1461 /* 1462 * Return a new buffer struct. 1463 * Create a new buffer if we haven't gone over our high water 1464 * mark for memory, otherwise try to get one off the freelist. 1465 * 1466 * Returns a locked buf that has no id and is not on any hash or free 1467 * list. 1468 */ 1469 static struct buf * 1470 bio_getfreeblk(long bsize) 1471 { 1472 struct buf *bp, *dp; 1473 struct hbuf *hp; 1474 kmutex_t *hmp; 1475 uint_t start, end; 1476 1477 /* 1478 * mutex_enter(&bfree_lock); 1479 * bfreelist.b_bufsize represents the amount of memory 1480 * mutex_exit(&bfree_lock); protect ref to bfreelist 1481 * we are allowed to allocate in the cache before we hit our hwm. 1482 */ 1483 bio_mem_get(bsize); /* Account for our memory request */ 1484 1485 again: 1486 bp = bio_bhdr_alloc(); /* Get a buf hdr */ 1487 sema_p(&bp->b_sem); /* Should never fail */ 1488 1489 ASSERT(bp->b_un.b_addr == NULL); 1490 bp->b_un.b_addr = kmem_alloc(bsize, KM_NOSLEEP); 1491 if (bp->b_un.b_addr != NULL) { 1492 /* 1493 * Make the common path short 1494 */ 1495 bp->b_bufsize = bsize; 1496 ASSERT(SEMA_HELD(&bp->b_sem)); 1497 return (bp); 1498 } else { 1499 struct buf *save; 1500 1501 save = bp; /* Save bp we allocated */ 1502 start = end = lastindex; 1503 1504 biostats.bio_bufwant.value.ui32++; 1505 1506 /* 1507 * Memory isn't available from the system now. Scan 1508 * the hash buckets till enough space is found. 1509 */ 1510 do { 1511 hp = &hbuf[start]; 1512 hmp = &hp->b_lock; 1513 dp = (struct buf *)hp; 1514 1515 mutex_enter(hmp); 1516 bp = dp->av_forw; 1517 1518 while (bp != dp) { 1519 1520 ASSERT(bp != NULL); 1521 1522 if (!sema_tryp(&bp->b_sem)) { 1523 bp = bp->av_forw; 1524 continue; 1525 } 1526 1527 /* 1528 * Since we are going down the freelist 1529 * associated with this hash bucket the 1530 * B_DELWRI flag should not be set. 1531 */ 1532 ASSERT(!(bp->b_flags & B_DELWRI)); 1533 1534 if (bp->b_bufsize == bsize) { 1535 hp->b_length--; 1536 notavail(bp); 1537 bremhash(bp); 1538 mutex_exit(hmp); 1539 1540 /* 1541 * Didn't kmem_alloc any more, so don't 1542 * count it twice. 1543 */ 1544 mutex_enter(&bfree_lock); 1545 bfreelist.b_bufsize += bsize; 1546 mutex_exit(&bfree_lock); 1547 1548 /* 1549 * Update the lastindex value. 1550 */ 1551 lastindex = start; 1552 1553 /* 1554 * Put our saved bp back on the list 1555 */ 1556 sema_v(&save->b_sem); 1557 bio_bhdr_free(save); 1558 ASSERT(SEMA_HELD(&bp->b_sem)); 1559 return (bp); 1560 } 1561 sema_v(&bp->b_sem); 1562 bp = bp->av_forw; 1563 } 1564 mutex_exit(hmp); 1565 start = ((start + 1) % v.v_hbuf); 1566 } while (start != end); 1567 1568 biostats.bio_bufwait.value.ui32++; 1569 bp = save; /* Use original bp */ 1570 bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP); 1571 } 1572 1573 bp->b_bufsize = bsize; 1574 ASSERT(SEMA_HELD(&bp->b_sem)); 1575 return (bp); 1576 } 1577 1578 /* 1579 * Allocate a buffer header. If none currently available, allocate 1580 * a new pool. 1581 */ 1582 static struct buf * 1583 bio_bhdr_alloc(void) 1584 { 1585 struct buf *dp, *sdp; 1586 struct buf *bp; 1587 int i; 1588 1589 for (;;) { 1590 mutex_enter(&bhdr_lock); 1591 if (bhdrlist != NULL) { 1592 bp = bhdrlist; 1593 bhdrlist = bp->av_forw; 1594 mutex_exit(&bhdr_lock); 1595 bp->av_forw = NULL; 1596 return (bp); 1597 } 1598 mutex_exit(&bhdr_lock); 1599 1600 /* 1601 * Need to allocate a new pool. If the system is currently 1602 * out of memory, then try freeing things on the freelist. 1603 */ 1604 dp = kmem_zalloc(sizeof (struct buf) * v.v_buf, KM_NOSLEEP); 1605 if (dp == NULL) { 1606 /* 1607 * System can't give us a pool of headers, try 1608 * recycling from the free lists. 1609 */ 1610 bio_recycle(BIO_HEADER, 0); 1611 } else { 1612 sdp = dp; 1613 for (i = 0; i < v.v_buf; i++, dp++) { 1614 /* 1615 * The next two lines are needed since NODEV 1616 * is -1 and not NULL 1617 */ 1618 dp->b_dev = (o_dev_t)NODEV; 1619 dp->b_edev = NODEV; 1620 dp->av_forw = dp + 1; 1621 sema_init(&dp->b_sem, 1, NULL, SEMA_DEFAULT, 1622 NULL); 1623 sema_init(&dp->b_io, 0, NULL, SEMA_DEFAULT, 1624 NULL); 1625 dp->b_offset = -1; 1626 } 1627 mutex_enter(&bhdr_lock); 1628 (--dp)->av_forw = bhdrlist; /* Fix last pointer */ 1629 bhdrlist = sdp; 1630 nbuf += v.v_buf; 1631 bp = bhdrlist; 1632 bhdrlist = bp->av_forw; 1633 mutex_exit(&bhdr_lock); 1634 1635 bp->av_forw = NULL; 1636 return (bp); 1637 } 1638 } 1639 } 1640 1641 static void 1642 bio_bhdr_free(struct buf *bp) 1643 { 1644 ASSERT(bp->b_back == NULL); 1645 ASSERT(bp->b_forw == NULL); 1646 ASSERT(bp->av_back == NULL); 1647 ASSERT(bp->av_forw == NULL); 1648 ASSERT(bp->b_un.b_addr == NULL); 1649 ASSERT(bp->b_dev == (o_dev_t)NODEV); 1650 ASSERT(bp->b_edev == NODEV); 1651 ASSERT(bp->b_flags == 0); 1652 1653 mutex_enter(&bhdr_lock); 1654 bp->av_forw = bhdrlist; 1655 bhdrlist = bp; 1656 mutex_exit(&bhdr_lock); 1657 } 1658 1659 /* 1660 * If we haven't gone over the high water mark, it's o.k. to 1661 * allocate more buffer space, otherwise recycle buffers 1662 * from the freelist until enough memory is free for a bsize request. 1663 * 1664 * We account for this memory, even though 1665 * we don't allocate it here. 1666 */ 1667 static void 1668 bio_mem_get(long bsize) 1669 { 1670 mutex_enter(&bfree_lock); 1671 if (bfreelist.b_bufsize > bsize) { 1672 bfreelist.b_bufsize -= bsize; 1673 mutex_exit(&bfree_lock); 1674 return; 1675 } 1676 mutex_exit(&bfree_lock); 1677 bio_recycle(BIO_MEM, bsize); 1678 } 1679 1680 /* 1681 * flush a list of delayed write buffers. 1682 * (currently used only by bio_recycle below.) 1683 */ 1684 static void 1685 bio_flushlist(struct buf *delwri_list) 1686 { 1687 struct buf *bp; 1688 1689 while (delwri_list != EMPTY_LIST) { 1690 bp = delwri_list; 1691 bp->b_flags |= B_AGE | B_ASYNC; 1692 if (bp->b_vp == NULL) { /* !ufs */ 1693 BWRITE(bp); 1694 } else { /* ufs */ 1695 UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp); 1696 } 1697 delwri_list = bp->b_list; 1698 bp->b_list = NULL; 1699 } 1700 } 1701 1702 /* 1703 * Start recycling buffers on the freelist for one of 2 reasons: 1704 * - we need a buffer header 1705 * - we need to free up memory 1706 * Once started we continue to recycle buffers until the B_AGE 1707 * buffers are gone. 1708 */ 1709 static void 1710 bio_recycle(int want, long bsize) 1711 { 1712 struct buf *bp, *dp, *dwp, *nbp; 1713 struct hbuf *hp; 1714 int found = 0; 1715 kmutex_t *hmp; 1716 int start, end; 1717 struct buf *delwri_list = EMPTY_LIST; 1718 1719 /* 1720 * Recycle buffers. 1721 */ 1722 top: 1723 start = end = lastindex; 1724 do { 1725 hp = &hbuf[start]; 1726 hmp = &hp->b_lock; 1727 dp = (struct buf *)hp; 1728 1729 mutex_enter(hmp); 1730 bp = dp->av_forw; 1731 1732 while (bp != dp) { 1733 1734 ASSERT(bp != NULL); 1735 1736 if (!sema_tryp(&bp->b_sem)) { 1737 bp = bp->av_forw; 1738 continue; 1739 } 1740 /* 1741 * Do we really want to nuke all of the B_AGE stuff?? 1742 */ 1743 if ((bp->b_flags & B_AGE) == 0 && found) { 1744 sema_v(&bp->b_sem); 1745 mutex_exit(hmp); 1746 lastindex = start; 1747 return; /* All done */ 1748 } 1749 1750 ASSERT(MUTEX_HELD(&hp->b_lock)); 1751 ASSERT(!(bp->b_flags & B_DELWRI)); 1752 hp->b_length--; 1753 notavail(bp); 1754 1755 /* 1756 * Remove bhdr from cache, free up memory, 1757 * and add the hdr to the freelist. 1758 */ 1759 bremhash(bp); 1760 mutex_exit(hmp); 1761 1762 if (bp->b_bufsize) { 1763 kmem_free(bp->b_un.b_addr, bp->b_bufsize); 1764 bp->b_un.b_addr = NULL; 1765 mutex_enter(&bfree_lock); 1766 bfreelist.b_bufsize += bp->b_bufsize; 1767 mutex_exit(&bfree_lock); 1768 } 1769 1770 bp->b_dev = (o_dev_t)NODEV; 1771 bp->b_edev = NODEV; 1772 bp->b_flags = 0; 1773 sema_v(&bp->b_sem); 1774 bio_bhdr_free(bp); 1775 if (want == BIO_HEADER) { 1776 found = 1; 1777 } else { 1778 ASSERT(want == BIO_MEM); 1779 if (!found && bfreelist.b_bufsize >= bsize) { 1780 /* Account for the memory we want */ 1781 mutex_enter(&bfree_lock); 1782 if (bfreelist.b_bufsize >= bsize) { 1783 bfreelist.b_bufsize -= bsize; 1784 found = 1; 1785 } 1786 mutex_exit(&bfree_lock); 1787 } 1788 } 1789 1790 /* 1791 * Since we dropped hmp start from the 1792 * begining. 1793 */ 1794 mutex_enter(hmp); 1795 bp = dp->av_forw; 1796 } 1797 mutex_exit(hmp); 1798 1799 /* 1800 * Look at the delayed write list. 1801 * First gather into a private list, then write them. 1802 */ 1803 dwp = (struct buf *)&dwbuf[start]; 1804 mutex_enter(&blist_lock); 1805 bio_doingflush++; 1806 mutex_enter(hmp); 1807 for (bp = dwp->av_forw; bp != dwp; bp = nbp) { 1808 1809 ASSERT(bp != NULL); 1810 nbp = bp->av_forw; 1811 1812 if (!sema_tryp(&bp->b_sem)) 1813 continue; 1814 ASSERT(bp->b_flags & B_DELWRI); 1815 /* 1816 * Do we really want to nuke all of the B_AGE stuff?? 1817 */ 1818 1819 if ((bp->b_flags & B_AGE) == 0 && found) { 1820 sema_v(&bp->b_sem); 1821 mutex_exit(hmp); 1822 lastindex = start; 1823 mutex_exit(&blist_lock); 1824 bio_flushlist(delwri_list); 1825 mutex_enter(&blist_lock); 1826 bio_doingflush--; 1827 if (bio_flinv_cv_wanted) { 1828 bio_flinv_cv_wanted = 0; 1829 cv_broadcast(&bio_flushinval_cv); 1830 } 1831 mutex_exit(&blist_lock); 1832 return; /* All done */ 1833 } 1834 1835 /* 1836 * If the buffer is already on a flush or 1837 * invalidate list then just skip it. 1838 */ 1839 if (bp->b_list != NULL) { 1840 sema_v(&bp->b_sem); 1841 continue; 1842 } 1843 /* 1844 * We are still on the same bucket. 1845 */ 1846 hp->b_length--; 1847 notavail(bp); 1848 bp->b_list = delwri_list; 1849 delwri_list = bp; 1850 } 1851 mutex_exit(hmp); 1852 mutex_exit(&blist_lock); 1853 bio_flushlist(delwri_list); 1854 delwri_list = EMPTY_LIST; 1855 mutex_enter(&blist_lock); 1856 bio_doingflush--; 1857 if (bio_flinv_cv_wanted) { 1858 bio_flinv_cv_wanted = 0; 1859 cv_broadcast(&bio_flushinval_cv); 1860 } 1861 mutex_exit(&blist_lock); 1862 start = (start + 1) % v.v_hbuf; 1863 1864 } while (start != end); 1865 1866 if (found) 1867 return; 1868 1869 /* 1870 * Free lists exhausted and we haven't satisfied the request. 1871 * Wait here for more entries to be added to freelist. 1872 * Because this might have just happened, make it timed. 1873 */ 1874 mutex_enter(&bfree_lock); 1875 bfreelist.b_flags |= B_WANTED; 1876 (void) cv_timedwait(&bio_mem_cv, &bfree_lock, lbolt+hz); 1877 mutex_exit(&bfree_lock); 1878 goto top; 1879 } 1880 1881 /* 1882 * See if the block is associated with some buffer 1883 * (mainly to avoid getting hung up on a wait in breada). 1884 */ 1885 static int 1886 bio_incore(dev_t dev, daddr_t blkno) 1887 { 1888 struct buf *bp; 1889 struct buf *dp; 1890 uint_t index; 1891 kmutex_t *hmp; 1892 1893 index = bio_bhash(dev, blkno); 1894 dp = (struct buf *)&hbuf[index]; 1895 hmp = &hbuf[index].b_lock; 1896 1897 mutex_enter(hmp); 1898 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) { 1899 if (bp->b_blkno == blkno && bp->b_edev == dev && 1900 (bp->b_flags & B_STALE) == 0) { 1901 mutex_exit(hmp); 1902 return (1); 1903 } 1904 } 1905 mutex_exit(hmp); 1906 return (0); 1907 } 1908 1909 static void 1910 bio_pageio_done(struct buf *bp) 1911 { 1912 if (bp->b_flags & B_PAGEIO) { 1913 1914 if (bp->b_flags & B_REMAPPED) 1915 bp_mapout(bp); 1916 1917 if (bp->b_flags & B_READ) 1918 pvn_read_done(bp->b_pages, bp->b_flags); 1919 else 1920 pvn_write_done(bp->b_pages, B_WRITE | bp->b_flags); 1921 pageio_done(bp); 1922 } else { 1923 ASSERT(bp->b_flags & B_REMAPPED); 1924 bp_mapout(bp); 1925 brelse(bp); 1926 } 1927 } 1928 1929 /* 1930 * bioerror(9F) - indicate error in buffer header 1931 * If 'error' is zero, remove the error indication. 1932 */ 1933 void 1934 bioerror(struct buf *bp, int error) 1935 { 1936 ASSERT(bp != NULL); 1937 ASSERT(error >= 0); 1938 ASSERT(SEMA_HELD(&bp->b_sem)); 1939 1940 if (error != 0) { 1941 bp->b_flags |= B_ERROR; 1942 } else { 1943 bp->b_flags &= ~B_ERROR; 1944 } 1945 bp->b_error = error; 1946 } 1947 1948 /* 1949 * bioreset(9F) - reuse a private buffer header after I/O is complete 1950 */ 1951 void 1952 bioreset(struct buf *bp) 1953 { 1954 ASSERT(bp != NULL); 1955 1956 biofini(bp); 1957 bioinit(bp); 1958 } 1959 1960 /* 1961 * biosize(9F) - return size of a buffer header 1962 */ 1963 size_t 1964 biosize(void) 1965 { 1966 return (sizeof (struct buf)); 1967 } 1968 1969 /* 1970 * biomodified(9F) - check if buffer is modified 1971 */ 1972 int 1973 biomodified(struct buf *bp) 1974 { 1975 int npf; 1976 int ppattr; 1977 struct page *pp; 1978 1979 ASSERT(bp != NULL); 1980 1981 if ((bp->b_flags & B_PAGEIO) == 0) { 1982 return (-1); 1983 } 1984 pp = bp->b_pages; 1985 npf = btopr(bp->b_bcount + ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET)); 1986 1987 while (npf > 0) { 1988 ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO | 1989 HAT_SYNC_STOPON_MOD); 1990 if (ppattr & P_MOD) 1991 return (1); 1992 pp = pp->p_next; 1993 npf--; 1994 } 1995 1996 return (0); 1997 } 1998 1999 /* 2000 * bioinit(9F) - initialize a buffer structure 2001 */ 2002 void 2003 bioinit(struct buf *bp) 2004 { 2005 bzero(bp, sizeof (struct buf)); 2006 sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL); 2007 sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL); 2008 bp->b_offset = -1; 2009 } 2010 2011 /* 2012 * biofini(9F) - uninitialize a buffer structure 2013 */ 2014 void 2015 biofini(struct buf *bp) 2016 { 2017 sema_destroy(&bp->b_io); 2018 sema_destroy(&bp->b_sem); 2019 } 2020 2021 /* 2022 * bioclone(9F) - clone a buffer 2023 */ 2024 struct buf * 2025 bioclone(struct buf *bp, off_t off, size_t len, dev_t dev, daddr_t blkno, 2026 int (*iodone)(struct buf *), struct buf *bp_mem, int sleep) 2027 { 2028 struct buf *bufp; 2029 2030 ASSERT(bp); 2031 if (bp_mem == NULL) { 2032 bufp = kmem_alloc(sizeof (struct buf), sleep); 2033 if (bufp == NULL) { 2034 return (NULL); 2035 } 2036 bioinit(bufp); 2037 } else { 2038 bufp = bp_mem; 2039 bioreset(bufp); 2040 } 2041 2042 #define BUF_CLONE_FLAGS (B_READ|B_WRITE|B_SHADOW|B_PHYS|B_PAGEIO|B_FAILFAST|\ 2043 B_ABRWRITE) 2044 2045 /* 2046 * the cloned buffer does not inherit the B_REMAPPED flag. A separate 2047 * bp_mapin(9F) has to be done to get a kernel mapping. 2048 */ 2049 bufp->b_flags = (bp->b_flags & BUF_CLONE_FLAGS) | B_BUSY; 2050 bufp->b_bcount = len; 2051 bufp->b_blkno = blkno; 2052 bufp->b_iodone = iodone; 2053 bufp->b_proc = bp->b_proc; 2054 bufp->b_edev = dev; 2055 bufp->b_file = bp->b_file; 2056 bufp->b_offset = bp->b_offset; 2057 2058 if (bp->b_flags & B_SHADOW) { 2059 ASSERT(bp->b_shadow); 2060 ASSERT(bp->b_flags & B_PHYS); 2061 2062 bufp->b_shadow = bp->b_shadow + 2063 btop(((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off); 2064 bufp->b_un.b_addr = (caddr_t)((uintptr_t)bp->b_un.b_addr + off); 2065 } else { 2066 if (bp->b_flags & B_PAGEIO) { 2067 struct page *pp; 2068 off_t o; 2069 int i; 2070 2071 pp = bp->b_pages; 2072 o = ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off; 2073 for (i = btop(o); i > 0; i--) { 2074 pp = pp->p_next; 2075 } 2076 bufp->b_pages = pp; 2077 bufp->b_un.b_addr = (caddr_t)(o & PAGEOFFSET); 2078 } else { 2079 bufp->b_un.b_addr = 2080 (caddr_t)((uintptr_t)bp->b_un.b_addr + off); 2081 if (bp->b_flags & B_REMAPPED) 2082 bufp->b_proc = NULL; 2083 } 2084 } 2085 return (bufp); 2086 } 2087